diff --git a/1001-config-add-syslog-ng-and-logrotate-config.patch b/1001-config-add-syslog-ng-and-logrotate-config.patch new file mode 100644 index 0000000000000000000000000000000000000000..b540055656d82164142d04775163ec55859d4f8c --- /dev/null +++ b/1001-config-add-syslog-ng-and-logrotate-config.patch @@ -0,0 +1,203 @@ +From 6949adcd0e7595000b882d57ebc7e3f47c40508e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 18 Mar 2025 16:24:56 +0800 +Subject: [PATCH 01/30] config: add syslog-ng and logrotate config + +redirect all rasdaemon log to /var/log/rasdaemon and config logrotate, +add related modification in rasdaemon.spec + +The patch does not directly add a dependency on the syslog-ng package +to the rasdaemon RPM. Instead, it dynamically checks whether the +syslog-ng service is running during installation and configures accordingly. + +Signed-off-by: Bing Wu +Signed-off-by: Ruidong Tian +--- + Makefile.am | 31 +++++++++++++++++++++----- + man/rasdaemon.1.in | 3 ++- + misc/rasdaemon.logrotate.in | 14 ++++++++++++ + misc/rasdaemon.spec.in | 43 +++++++++++++++++++++++++++++++++---- + misc/rasdaemon.syslog-ng.in | 7 ++++++ + 6 files changed, 90 insertions(+), 10 deletions(-) + create mode 100644 misc/rasdaemon.logrotate.in + create mode 100644 misc/rasdaemon.syslog-ng.in + +diff --git a/Makefile.am b/Makefile.am +index 01132fe..a1f6edf 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -5,27 +5,42 @@ ACLOCAL_AMFLAGS=-I m4 + SUBDIRS = util man + SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in + SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) ++SYSLOG_SERVICES_IN = misc/rasdaemon.syslog-ng.in ++SYSLOG_SERVICES = $(SYSLOG_SERVICES_IN:.syslog-ng.in=.syslog-ng) ++LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in ++LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate) + EXTRA_DIST = \ +- $(SYSTEMD_SERVICES_IN) misc/rasdaemon.env \ ++ $(SYSTEMD_SERVICES_IN) \ ++ $(SYSLOG_SERVICES_IN) \ ++ 
$(LOGROTATE_SERVICES_IN) \ ++ misc/rasdaemon.env \ + contrib/mc_event_trigger \ + contrib/mem_fail_trigger + + CLEANFILES= \ + misc/ras-mc-ctl.service \ +- misc/rasdaemon.service ++ misc/rasdaemon.service \ ++ misc/rasdaemon.syslog-ng \ ++ misc/rasdaemon.logrotate + + DISTCLEANFILES = misc/rasdaemon.spec + + # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin + # during ./configure phase, therefore it is not possible to add .service.in + # files to AC_CONFIG_FILES in configure.ac +-SUFFIXES = .service.in .service ++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng + .service.in.service: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@ + ++.logrotate.in.logrotate: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ ++.syslog-ng.in.syslog-ng: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ + # This rule is needed because the service files must be generated on target + # system after ./configure phase +-all-local: $(SYSTEMD_SERVICES) ++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(LOGROTATE_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +@@ -128,6 +143,12 @@ upload: + install-data-local: + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" + $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" +- $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" + $(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger" + $(install_sh) @abs_srcdir@/contrib/mem_fail_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mem_fail_trigger" ++ if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng 
"$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \ ++ fi ++ if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ ++ fi +diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in +index 7cfef54..e884e55 100644 +--- a/man/rasdaemon.1.in ++++ b/man/rasdaemon.1.in +@@ -34,7 +34,8 @@ rasdaemon \- RAS daemon to log the RAS events. + The \fBrasdaemon\fR program is a daemon which monitors the platform + Reliablity, Availability and Serviceability (RAS) reports from the + Linux kernel trace events. These trace events are logged in +-/sys/kernel/debug/tracing, reporting them via syslog/journald. ++/sys/kernel/debug/tracing, reporting them via syslog/journald. If ++syslog-ng is installed, the events will be logged at @localstatedir@/log/rasdaemon. + + .SH OPTIONS + .TP +diff --git a/misc/rasdaemon.logrotate.in b/misc/rasdaemon.logrotate.in +new file mode 100644 +index 0000000..b7b62fe +--- /dev/null ++++ b/misc/rasdaemon.logrotate.in +@@ -0,0 +1,14 @@ ++@localstatedir@/log/rasdaemon { ++ compress ++ monthly ++ size 100M ++ dateext ++ rotate 4 ++ notifempty ++ missingok ++ copytruncate ++ sharedscripts ++ postrotate ++ @sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1 || true ++ endscript ++} +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 32c69b7..8ab3d50 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -49,20 +49,55 @@ make %{?_smp_mflags} + + %install + make install DESTDIR=%{buildroot} +-install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service ++install -D -p -m 0644 misc/%{name}.service %{buildroot}%{_unitdir}/%{name}.service + install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service +-install -D -p -m 0655 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} ++install -D -p -m 0655 misc/%{name}.env 
%{buildroot}%{_sysconfdir}/sysconfig/%{name} ++install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng ++install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate + rm INSTALL %{buildroot}/usr/include/*.h + + %files +-%doc AUTHORS ChangeLog COPYING README.md TODO +-%{_sbindir}/rasdaemon ++%doc AUTHORS ChangeLog COPYING TODO ++%{_sbindir}/%{name} + %{_sbindir}/ras-mc-ctl + %{_mandir}/*/* + %{_unitdir}/*.service + %{_sysconfdir}/ras/dimm_labels.d + %{_sysconfdir}/ras/*/* + %config(noreplace) %{_sysconfdir}/sysconfig/%{name} ++%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng ++%config(noreplace) /usr/share/%{name}/%{name}.logrotate ++ ++%post ++if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then ++ echo "Syslog service is enabled and running, create config file and restart it"; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ systemctl restart syslog-ng.service; ++fi ++if [ -d "%{_sysconfdir}/logrotate.d" ]; then ++ rm -rf %{_sysconfdir}/logrotate.d/%{name}; ++ ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name}; ++fi ++if ! 
systemctl is-enabled --quiet %{name}.service; then ++ echo "Rasdaemon service is not enabled, enable it"; ++ systemctl enable %{name}.service; ++fi ++systemctl restart %{name}.service ++ ++%preun ++systemctl stop %{name}.service ++systemctl disable %{name}.service ++ ++%postun ++if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then ++ echo "Syslog service is enabled and running, delete config file and restart it"; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ systemctl restart syslog-ng.service; ++fi ++if [ -d "%{_sysconfdir}/logrotate.d" ]; then ++ rm -rf %{_sysconfdir}/logrotate.d/%{name}; ++fi + + %changelog + +diff --git a/misc/rasdaemon.syslog-ng.in b/misc/rasdaemon.syslog-ng.in +new file mode 100644 +index 0000000..b3308f8 +--- /dev/null ++++ b/misc/rasdaemon.syslog-ng.in +@@ -0,0 +1,7 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++destination d_rasdaemon { file("@localstatedir@/log/rasdaemon" persist-name(rasdaemon-syslog)); }; ++ ++filter f_rasdaemon { program("rasdaemon"); }; ++ ++log { source(s_sys); filter(f_rasdaemon); destination(d_rasdaemon); }; +-- +2.43.5 + diff --git a/1001-rasdaemon-mce-amd-smca-properly-limit-bank-types.patch b/1001-rasdaemon-mce-amd-smca-properly-limit-bank-types.patch deleted file mode 100644 index 9255ac9858729353871f4044d75560725b15e553..0000000000000000000000000000000000000000 --- a/1001-rasdaemon-mce-amd-smca-properly-limit-bank-types.patch +++ /dev/null @@ -1,29 +0,0 @@ -From 1eb161a1c0ed47d1e260956f9bd9fb4beff81d3c Mon Sep 17 00:00:00 2001 -From: Aristeu Rozanski -Date: Thu, 19 Jan 2023 08:45:57 -0500 -Subject: [PATCH 01/85] rasdaemon: mce-amd-smca: properly limit bank types - -Found with covscan. 
- -Signed-off-by: Aristeu Rozanski -Signed-off-by: Mauro Carvalho Chehab ---- - mce-amd-smca.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 7cc596e..233fa0a 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -931,7 +931,7 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m) - return; - } - -- if (bank_type >= MAX_NR_BANKS) { -+ if (bank_type >= N_SMCA_BANK_TYPES) { - strcpy(e->mcastatus_msg, "Don't know how to decode this bank"); - return; - } --- -2.33.1 - diff --git a/1002-config-add-rsyslog-config.patch b/1002-config-add-rsyslog-config.patch new file mode 100644 index 0000000000000000000000000000000000000000..8ad5777c21a1c2b7436c8e7ff6357027c9d7fddf --- /dev/null +++ b/1002-config-add-rsyslog-config.patch @@ -0,0 +1,160 @@ +From f1f27e8f90a0be341e367a40962f6f7103504659 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 15 Apr 2025 11:18:02 +0800 +Subject: [PATCH 02/30] config: add rsyslog config + +redirect all rasdaemon log to /var/log/rasdaemon, +add related modification in rasdaemon.spec + +The patch does not directly add a dependency on the rsyslog package +to the rasdaemon RPM. Instead, it dynamically checks whether the +rsyslog service is running during installation and configures accordingly. 
+ +Signed-off-by: Bing Wu +Signed-off-by: Ruidong Tian +--- + Makefile.am | 14 ++++++++++++-- + misc/rasdaemon.logrotate.in | 3 ++- + misc/rasdaemon.rsyslog.in | 3 +++ + misc/rasdaemon.spec.in | 19 ++++++++++++++++--- + 5 files changed, 34 insertions(+), 6 deletions(-) + create mode 100644 misc/rasdaemon.rsyslog.in + +diff --git a/Makefile.am b/Makefile.am +index a1f6edf..e3e66bb 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -9,9 +9,12 @@ SYSLOG_SERVICES_IN = misc/rasdaemon.syslog-ng.in + SYSLOG_SERVICES = $(SYSLOG_SERVICES_IN:.syslog-ng.in=.syslog-ng) + LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in + LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate) ++RSYSLOG_SERVICES_IN = misc/rasdaemon.rsyslog.in ++RSYSLOG_SERVICES = $(RSYSLOG_SERVICES_IN:.rsyslog.in=.rsyslog) + EXTRA_DIST = \ + $(SYSTEMD_SERVICES_IN) \ + $(SYSLOG_SERVICES_IN) \ ++ $(RSYSLOG_SERVICES_IN) \ + $(LOGROTATE_SERVICES_IN) \ + misc/rasdaemon.env \ + contrib/mc_event_trigger \ +@@ -21,6 +24,7 @@ CLEANFILES= \ + misc/ras-mc-ctl.service \ + misc/rasdaemon.service \ + misc/rasdaemon.syslog-ng \ ++ misc/rasdaemon.rsyslog \ + misc/rasdaemon.logrotate + + DISTCLEANFILES = misc/rasdaemon.spec +@@ -28,7 +32,7 @@ DISTCLEANFILES = misc/rasdaemon.spec + # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin + # during ./configure phase, therefore it is not possible to add .service.in + # files to AC_CONFIG_FILES in configure.ac +-SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng ++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog + .service.in.service: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@ + +@@ -38,9 +42,12 @@ SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-n + .syslog-ng.in.syslog-ng: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ + ++.rsyslog.in.rsyslog: ++ 
sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ + # This rule is needed because the service files must be generated on target + # system after ./configure phase +-all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(LOGROTATE_SERVICES) ++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +@@ -149,6 +156,9 @@ install-data-local: + if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \ + fi ++ if [ -d "$(DESTDIR)@sysconfdir@/rsyslog.d/" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.conf"; \ ++ fi + if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ + fi +diff --git a/misc/rasdaemon.logrotate.in b/misc/rasdaemon.logrotate.in +index b7b62fe..ca188ba 100644 +--- a/misc/rasdaemon.logrotate.in ++++ b/misc/rasdaemon.logrotate.in +@@ -9,6 +9,7 @@ + copytruncate + sharedscripts + postrotate +- @sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1 || true ++ (@sbindir@/systemctl is-active --quiet syslog-ng.service && @sbindir@/systemctl kill -s HUP syslog-ng.service >/dev/null 2>&1) || true ++ (@sbindir@/systemctl is-active --quiet rsyslog.service &&@sbindir@/systemctl kill -s HUP rsyslog.service >/dev/null 2>&1) || true + endscript + } +diff --git a/misc/rasdaemon.rsyslog.in b/misc/rasdaemon.rsyslog.in +new file mode 100644 +index 0000000..d1a5cf1 +--- /dev/null ++++ b/misc/rasdaemon.rsyslog.in +@@ -0,0 +1,3 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++:programname, isequal, "rasdaemon" @localstatedir@/log/rasdaemon +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in 
b/misc/rasdaemon.spec.in +index 8ab3d50..4cc859f 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -54,6 +54,7 @@ install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl + install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{name} + install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng + install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate ++install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -67,14 +68,21 @@ rm INSTALL %{buildroot}/usr/include/*.h + %config(noreplace) %{_sysconfdir}/sysconfig/%{name} + %config(noreplace) /usr/share/%{name}/%{name}.syslog-ng + %config(noreplace) /usr/share/%{name}/%{name}.logrotate ++%config(noreplace) /usr/share/%{name}/%{name}.rsyslog + + %post +-if systemctl is-enabled --quiet syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then ++if systemctl is-active --quiet syslog-ng.service; then + echo "Syslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + systemctl restart syslog-ng.service; + fi ++if systemctl is-active --quiet rsyslog.service; then ++ echo "Rsyslog service is enabled and running, create config file and restart it"; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ systemctl restart rsyslog.service; ++fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then + rm -rf %{_sysconfdir}/logrotate.d/%{name}; + ln -s /usr/share/%{name}/%{name}.logrotate %{_sysconfdir}/logrotate.d/%{name}; +@@ -90,11 +98,16 @@ systemctl stop %{name}.service + systemctl disable %{name}.service + + %postun +-if systemctl is-enabled --quiet 
syslog-ng.service && systemctl is-active --quiet syslog-ng.service; then +- echo "Syslog service is enabled and running, delete config file and restart it"; ++if systemctl is-active --quiet syslog-ng.service; then ++ echo "Syslog-ng service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + systemctl restart syslog-ng.service; + fi ++if systemctl is-active --quiet rsyslog.service; then ++ echo "Rsyslog service is enabled and running, delete config file and restart it"; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ systemctl restart rsyslog.service; ++fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then + rm -rf %{_sysconfdir}/logrotate.d/%{name}; + fi +-- +2.43.5 + diff --git a/1002-rasdaemon-ras-memory-failure-handler-handle-localtim.patch b/1002-rasdaemon-ras-memory-failure-handler-handle-localtim.patch deleted file mode 100644 index 60dc329568587942766d7fd8f4fbc2bab7d4e00c..0000000000000000000000000000000000000000 --- a/1002-rasdaemon-ras-memory-failure-handler-handle-localtim.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 76846ec3b8740794b5c75934e8a24c07e6cf70bd Mon Sep 17 00:00:00 2001 -From: Aristeu Rozanski -Date: Thu, 19 Jan 2023 08:45:57 -0500 -Subject: [PATCH 02/85] rasdaemon: ras-memory-failure-handler: handle - localtime() failure correctly - -We could just have an empty string but keeping the format could prevent -issues if someone is actually parsing this. -Found with covscan. 
- -v2: fixed the timestamp as pointed by Robert Elliott - -Signed-off-by: Aristeu Rozanski -Signed-off-by: Mauro Carvalho Chehab ---- - ras-memory-failure-handler.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index 9941e68..1951456 100644 ---- a/ras-memory-failure-handler.c -+++ b/ras-memory-failure-handler.c -@@ -148,6 +148,8 @@ int ras_memory_failure_event_handler(struct trace_seq *s, - if (tm) - strftime(ev.timestamp, sizeof(ev.timestamp), - "%Y-%m-%d %H:%M:%S %z", tm); -+ else -+ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); - trace_seq_printf(s, "%s ", ev.timestamp); - - if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0) --- -2.33.1 - diff --git a/1003-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch b/1003-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch deleted file mode 100644 index fd982e9975c5fb8004fd5a5d5aee04f9dfe9d26c..0000000000000000000000000000000000000000 --- a/1003-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch +++ /dev/null @@ -1,939 +0,0 @@ -From 97a88dfbb3d1db9320618c2116e5dca06677c2ea Mon Sep 17 00:00:00 2001 -From: Shengwei Luo -Date: Wed, 23 Feb 2022 17:21:58 +0800 -Subject: [PATCH 03/85] rasdaemon: Support cpu fault isolation for corrected - errors - -When the corrected errors exceed the set limit in cycle, try to -offline the related cpu core. 
- -Signed-off-by: Shengwei Luo -Signed-off-by: Junchong Pan -Signed-off-by: Lei Feng -Signed-off-by: Xiaofei Tan -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - Makefile.am | 6 +- - configure.ac | 11 ++ - misc/rasdaemon.env | 17 ++ - queue.c | 119 ++++++++++++++ - queue.h | 39 +++++ - ras-arm-handler.c | 97 +++++++++++ - ras-arm-handler.h | 18 ++ - ras-cpu-isolation.c | 388 ++++++++++++++++++++++++++++++++++++++++++++ - ras-cpu-isolation.h | 67 ++++++++ - ras-events.c | 9 +- - 10 files changed, 769 insertions(+), 2 deletions(-) - create mode 100644 queue.c - create mode 100644 queue.h - create mode 100644 ras-cpu-isolation.c - create mode 100644 ras-cpu-isolation.h - -diff --git a/Makefile.am b/Makefile.am -index fabca78..242ceb7 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -63,13 +63,17 @@ endif - if WITH_AMP_NS_DECODE - rasdaemon_SOURCES += non-standard-ampere.c - endif -+if WITH_CPU_FAULT_ISOLATION -+ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c -+endif - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ -- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h -+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -+ ras-cpu-isolation.h queue.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -diff --git a/configure.ac b/configure.ac -index 33b81fe..d098fcf 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], - AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == 
xyes]) - AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"]) - -+AC_ARG_ENABLE([cpu_fault_isolation], -+ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation])) -+ -+AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [ -+ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation") -+ AC_SUBST([WITH_CPU_FAULT_ISOLATION]) -+]) -+AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) -+AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) -+ - test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc - - CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -201,4 +211,5 @@ compile time options summary - Memory Failure : $USE_MEMORY_FAILURE - Memory CE PFA : $USE_MEMORY_CE_PFA - AMP RAS errors : $USE_AMP_NS_DECODE -+ CPU fault isolation : $USE_CPU_FAULT_ISOLATION - EOF -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 12fd766..7cb18e8 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50" - # soft-then-hard First try to soft offline, then try hard offlining. - # Note: default offline choice is "soft". - PAGE_CE_ACTION="soft" -+ -+# CPU Online Fault Isolation -+# Whether to enable cpu online fault isolation (yes|no). -+CPU_ISOLATION_ENABLE="no" -+# Specify the threshold of CE numbers. 
-+# -+# Format: -+# [0-9]+[unit] -+# -+# Supported units: -+# CPU_CE_THRESHOLD: no unit -+# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second), default is in second -+CPU_CE_THRESHOLD="18" -+CPU_ISOLATION_CYCLE="24h" -+ -+# Prevent excessive isolation from causing an avalanche effect -+CPU_ISOLATION_LIMIT="10" -\ No newline at end of file -diff --git a/queue.c b/queue.c -new file mode 100644 -index 0000000..65b6fb8 ---- /dev/null -+++ b/queue.c -@@ -0,0 +1,119 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ */ -+#include -+#include -+#include "queue.h" -+#include "ras-logger.h" -+ -+int is_empty(struct link_queue *queue) -+{ -+ if (queue) -+ return queue->size == 0; -+ -+ return 1; -+} -+ -+struct link_queue *init_queue(void) -+{ -+ struct link_queue *queue = NULL; -+ -+ queue = (struct link_queue *)malloc(sizeof(struct link_queue)); -+ if (queue == NULL) { -+ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); -+ return NULL; -+ } -+ -+ queue->size = 0; -+ queue->head = NULL; -+ queue->tail = NULL; -+ -+ return queue; -+} -+ -+void clear_queue(struct link_queue *queue) -+{ -+ if (queue == NULL) -+ return; -+ -+ struct queue_node *node = queue->head; -+ struct queue_node *tmp = NULL; -+ -+ while (node != NULL) { -+ tmp = node; -+ node = node->next; -+ free(tmp); -+ } -+ -+ queue->head = NULL; -+ queue->tail = NULL; -+ queue->size = 0; -+} -+ -+void free_queue(struct link_queue *queue) -+{ -+ clear_queue(queue); -+ -+ if (queue) -+ free(queue); -+} -+ -+/* It should be guranteed that the param is not NULL */ -+void push(struct link_queue *queue, struct queue_node *node) -+{ -+ /* there is no element in the queue */ -+ if (queue->head == NULL) -+ queue->head = node; -+ else -+ queue->tail->next = node; -+ -+ queue->tail = node; -+ (queue->size)++; -+} -+ -+int pop(struct link_queue *queue) -+{ -+ struct queue_node *tmp = NULL; -+ -+ if (queue == NULL || is_empty(queue)) -+ return -1; -+ -+ tmp = queue->head; -+ queue->head = queue->head->next; -+ free(tmp); -+ (queue->size)--; -+ -+ return 0; -+} -+ -+struct queue_node *front(struct link_queue *queue) -+{ -+ if (queue == NULL) -+ return NULL; -+ -+ return queue->head; -+} -+ -+struct queue_node *node_create(time_t time, unsigned int value) -+{ -+ struct queue_node *node = NULL; -+ -+ node = (struct queue_node *)malloc(sizeof(struct queue_node)); -+ if (node != NULL) { -+ node->time = time; -+ node->value = value; -+ node->next = NULL; -+ } -+ -+ return node; -+} -diff --git a/queue.h b/queue.h -new file 
mode 100644 -index 0000000..5459f40 ---- /dev/null -+++ b/queue.h -@@ -0,0 +1,39 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#ifndef __RAS_QUEUE_H -+#define __RAS_QUEUE_H -+ -+struct queue_node { -+ time_t time; -+ unsigned int value; -+ struct queue_node *next; -+}; -+ -+struct link_queue { -+ struct queue_node *head; -+ struct queue_node *tail; -+ int size; -+}; -+ -+int is_empty(struct link_queue *queue); -+struct link_queue *init_queue(void); -+void clear_queue(struct link_queue *queue); -+void free_queue(struct link_queue *queue); -+void push(struct link_queue *queue, struct queue_node *node); -+int pop(struct link_queue *queue); -+struct queue_node *front(struct link_queue *queue); -+struct queue_node *node_create(time_t time, unsigned int value); -+ -+#endif -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 1149dc6..9c7a3c3 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -22,6 +22,10 @@ - #include "ras-report.h" - #include "ras-non-standard-handler.h" - #include "non-standard-ampere.h" -+#include "ras-cpu-isolation.h" -+ -+#define ARM_ERR_VALID_ERROR_COUNT BIT(0) -+#define ARM_ERR_VALID_FLAGS BIT(1) - - void display_raw_data(struct trace_seq *s, - const uint8_t *buf, -@@ -42,6 +46,93 @@ void display_raw_data(struct trace_seq *s, - } - } - -+#ifdef HAVE_CPU_FAULT_ISOLATION -+static int count_errors(struct ras_arm_event *ev) -+{ -+ struct ras_arm_err_info *err_info; -+ 
int num_pei; -+ int err_info_size = sizeof(struct ras_arm_err_info); -+ int num = 0; -+ int i; -+ int error_count; -+ -+ if (ev->pei_len % err_info_size != 0) { -+ log(TERM, LOG_ERR, -+ "The event data does not match to the ARM Processor Error Information Structure\n"); -+ return num; -+ } -+ num_pei = ev->pei_len / err_info_size; -+ err_info = (struct ras_arm_err_info *)(ev->pei_error); -+ -+ for (i = 0; i < num_pei; ++i) { -+ error_count = 1; -+ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) { -+ /* -+ * The value of this field is defined as follows: -+ * 0: Single Error -+ * 1: Multiple Errors -+ * 2-65535: Error Count -+ */ -+ error_count = err_info->multiple_error + 1; -+ } -+ -+ num += error_count; -+ err_info += 1; -+ } -+ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num); -+ return num; -+} -+ -+static int ras_handle_cpu_error(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, -+ struct ras_arm_event *ev, time_t now) -+{ -+ unsigned long long val; -+ int cpu; -+ char *severity; -+ struct error_info err_info; -+ -+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) -+ return -1; -+ cpu = val; -+ trace_seq_printf(s, "\n cpu: %d", cpu); -+ -+ /* record cpu error */ -+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) -+ return -1; -+ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */ -+ switch (val) { -+ case GHES_SEV_NO: -+ severity = "Informational"; -+ break; -+ case GHES_SEV_CORRECTED: -+ severity = "Corrected"; -+ break; -+ case GHES_SEV_RECOVERABLE: -+ severity = "Recoverable"; -+ break; -+ default: -+ case GHES_SEV_PANIC: -+ severity = "Fatal"; -+ } -+ trace_seq_printf(s, "\n severity: %s", severity); -+ -+ if (val == GHES_SEV_CORRECTED) { -+ int nums = count_errors(ev); -+ -+ if (nums > 0) { -+ err_info.nums = nums; -+ err_info.time = now; -+ err_info.err_type = val; -+ ras_record_cpu_error(&err_info, cpu); -+ } -+ } -+ -+ return 0; -+} -+#endif -+ - int 
ras_arm_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context) -@@ -52,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s, - struct tm *tm; - struct ras_arm_event ev; - int len = 0; -+ - memset(&ev, 0, sizeof(ev)); - - /* -@@ -139,6 +231,11 @@ int ras_arm_event_handler(struct trace_seq *s, - display_raw_data(s, ev.vsei_error, ev.oem_len); - #endif - -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ if (ras_handle_cpu_error(s, record, event, &ev, now) < 0) -+ return -1; -+#endif -+ - /* Insert data into the SGBD */ - #ifdef HAVE_SQLITE3 - ras_store_arm_record(ras, &ev); -diff --git a/ras-arm-handler.h b/ras-arm-handler.h -index 563a2d3..52813e7 100644 ---- a/ras-arm-handler.h -+++ b/ras-arm-handler.h -@@ -17,6 +17,24 @@ - #include "ras-events.h" - #include "libtrace/event-parse.h" - -+/* -+ * ARM Processor Error Information Structure, According to -+ * UEFI_2_9 specification chapter N2.4.4. -+ */ -+#pragma pack(1) -+struct ras_arm_err_info { -+ uint8_t version; -+ uint8_t length; -+ uint16_t validation_bits; -+ uint8_t type; -+ uint16_t multiple_error; -+ uint8_t flags; -+ uint64_t error_info; -+ uint64_t virt_fault_addr; -+ uint64_t physical_fault_addr; -+}; -+#pragma pack() -+ - int ras_arm_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context); -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -new file mode 100644 -index 0000000..1694a08 ---- /dev/null -+++ b/ras-cpu-isolation.c -@@ -0,0 +1,388 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "ras-logger.h" -+#include "ras-cpu-isolation.h" -+ -+#define SECOND_OF_MON (30 * 24 * 60 * 60) -+#define SECOND_OF_DAY (24 * 60 * 60) -+#define SECOND_OF_HOU (60 * 60) -+#define SECOND_OF_MIN (60) -+ -+#define LIMIT_OF_CPU_THRESHOLD 10000 -+#define INIT_OF_CPU_THRESHOLD 18 -+#define DEC_CHECK 10 -+#define LAST_BIT_OF_UL 5 -+ -+static struct cpu_info *cpu_infos; -+static unsigned int ncores; -+static unsigned int enabled = 1; -+static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -+ -+static const struct param normal_units[] = { -+ {"", 1}, -+ {} -+}; -+ -+static const struct param cycle_units[] = { -+ {"d", SECOND_OF_DAY}, -+ {"h", SECOND_OF_HOU}, -+ {"m", SECOND_OF_MIN}, -+ {"s", 1}, -+ {} -+}; -+ -+static struct isolation_param threshold = { -+ .name = "CPU_CE_THRESHOLD", -+ .units = normal_units, -+ .value = INIT_OF_CPU_THRESHOLD, -+ .limit = LIMIT_OF_CPU_THRESHOLD -+}; -+ -+static struct isolation_param cpu_limit = { -+ .name = "CPU_ISOLATION_LIMIT", -+ .units = normal_units -+}; -+ -+static struct isolation_param cycle = { -+ .name = "CPU_ISOLATION_CYCLE", -+ .units = cycle_units, -+ .value = SECOND_OF_DAY, -+ .limit = SECOND_OF_MON -+}; -+ -+static const char * const cpu_state[] = { -+ [CPU_OFFLINE] = "offline", -+ [CPU_ONLINE] = "online", -+ [CPU_OFFLINE_FAILED] = "offline-failed", -+ [CPU_UNKNOWN] = "unknown" -+}; -+ -+static int open_sys_file(unsigned int cpu, int __oflag, const char *format) -+{ -+ int fd; -+ char path[PATH_MAX] = ""; -+ char real_path[PATH_MAX] = ""; -+ -+ snprintf(path, sizeof(path), format, cpu); -+ if (strlen(path) > PATH_MAX || realpath(path, 
real_path) == NULL) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path); -+ return -1; -+ } -+ fd = open(real_path, __oflag); -+ if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, real_path); -+ return -1; -+ } -+ -+ return fd; -+} -+ -+static int get_cpu_status(unsigned int cpu) -+{ -+ int fd, num; -+ char buf[2] = ""; -+ -+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format); -+ if (fd == -1) -+ return CPU_UNKNOWN; -+ -+ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1) -+ num = CPU_UNKNOWN; -+ -+ close(fd); -+ -+ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num; -+} -+ -+static int init_cpu_info(unsigned int cpus) -+{ -+ ncores = cpus; -+ cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); -+ if (!cpu_infos) { -+ log(TERM, LOG_ERR, -+ "Failed to allocate memory for cpu infos in %s.\n", __func__); -+ return -1; -+ } -+ -+ for (unsigned int i = 0; i < cpus; ++i) { -+ cpu_infos[i].ce_nums = 0; -+ cpu_infos[i].state = get_cpu_status(i); -+ cpu_infos[i].ce_queue = init_queue(); -+ -+ if (cpu_infos[i].ce_queue == NULL) { -+ log(TERM, LOG_ERR, -+ "Failed to allocate memory for cpu ce queue in %s.\n", __func__); -+ return -1; -+ } -+ } -+ /* set limit of offlined cpu limit according to number of cpu */ -+ cpu_limit.limit = cpus - 1; -+ cpu_limit.value = 0; -+ -+ return 0; -+} -+ -+static void check_config(struct isolation_param *config) -+{ -+ if (config->value > config->limit) { -+ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", -+ config->value, config->limit); -+ config->value = config->limit; -+ } -+} -+ -+static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value) -+{ -+ char *unit = NULL; -+ int env_size, has_unit = 0; -+ -+ if (!env || strlen(env) == 0) -+ return -1; -+ -+ env_size = strlen(env); -+ unit = env + env_size - 1; -+ -+ if (isalpha(*unit)) { -+ has_unit = 1; -+ env_size--; -+ if (env_size <= 0) -+ return -1; -+ } -+ -+ 
for (int i = 0; i < env_size; ++i) { -+ if (isdigit(env[i])) { -+ if (*value > ULONG_MAX / DEC_CHECK || -+ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) { -+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = DEC_CHECK * (*value) + (env[i] - '0'); -+ } else -+ return -1; -+ } -+ -+ if (!has_unit) -+ return 0; -+ -+ for (const struct param *units = config->units; units->name; units++) { -+ /* value character and unit character are both valid */ -+ if (!strcasecmp(unit, units->name)) { -+ if (*value > (ULONG_MAX / units->value)) { -+ log(TERM, LOG_ERR, -+ "%s is out of range: %lu\n", env, ULONG_MAX); -+ return -1; -+ } -+ *value = (*value) * units->value; -+ return 0; -+ } -+ } -+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit); -+ return -1; -+} -+ -+static void init_config(struct isolation_param *config) -+{ -+ char *env = getenv(config->name); -+ unsigned long value = 0; -+ -+ if (parse_ul_config(config, env, &value) < 0) { -+ log(TERM, LOG_ERR, "Invalid %s: %s! 
Use default value %lu.\n", -+ config->name, env, config->value); -+ return; -+ } -+ -+ config->value = value; -+ check_config(config); -+} -+ -+static int check_config_status(void) -+{ -+ char *env = getenv("CPU_ISOLATION_ENABLE"); -+ -+ if (env == NULL || strcasecmp(env, "yes")) -+ return -1; -+ -+ return 0; -+} -+ -+void ras_cpu_isolation_init(unsigned int cpus) -+{ -+ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) { -+ enabled = 0; -+ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n"); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n"); -+ init_config(&threshold); -+ init_config(&cpu_limit); -+ init_config(&cycle); -+} -+ -+void cpu_infos_free(void) -+{ -+ if (cpu_infos) { -+ for (int i = 0; i < ncores; ++i) -+ free_queue(cpu_infos[i].ce_queue); -+ -+ free(cpu_infos); -+ } -+} -+ -+static int do_cpu_offline(unsigned int cpu) -+{ -+ int fd, rc; -+ char buf[2] = ""; -+ -+ cpu_infos[cpu].state = CPU_OFFLINE_FAILED; -+ fd = open_sys_file(cpu, O_RDWR, cpu_path_format); -+ if (fd == -1) -+ return HANDLE_FAILED; -+ -+ strcpy(buf, "0"); -+ rc = write(fd, buf, strlen(buf)); -+ if (rc < 0) { -+ log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno); -+ close(fd); -+ return HANDLE_FAILED; -+ } -+ -+ close(fd); -+ /* check wthether the cpu is isolated successfully */ -+ cpu_infos[cpu].state = get_cpu_status(cpu); -+ -+ if (cpu_infos[cpu].state == CPU_OFFLINE) -+ return HANDLE_SUCCEED; -+ -+ return HANDLE_FAILED; -+} -+ -+static int do_ce_handler(unsigned int cpu) -+{ -+ struct link_queue *queue = cpu_infos[cpu].ce_queue; -+ unsigned int tmp; -+ /* -+ * Since we just count all error numbers in setted cycle, we store the time -+ * and error numbers from current event to the queue, then everytime we -+ * calculate the period from beginning time to ending time, if the period -+ * exceeds setted cycle, we pop the beginning time and error until the period -+ * from new beginning time to ending time is less than cycle. 
-+ */ -+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) { -+ tmp = queue->head->value; -+ if (pop(queue) == 0) -+ cpu_infos[cpu].ce_nums -= tmp; -+ } -+ log(TERM, LOG_INFO, -+ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n", -+ cpu, cpu_infos[cpu].ce_nums); -+ -+ if (cpu_infos[cpu].ce_nums >= threshold.value) { -+ log(TERM, LOG_INFO, -+ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n", -+ threshold.value, cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; -+} -+ -+static int error_handler(unsigned int cpu, struct error_info *err_info) -+{ -+ int ret = HANDLE_NOTHING; -+ -+ switch (err_info->err_type) { -+ case CE: -+ ret = do_ce_handler(cpu); -+ break; -+ default: -+ break; -+ } -+ -+ return ret; -+} -+ -+static void record_error_info(unsigned int cpu, struct error_info *err_info) -+{ -+ switch (err_info->err_type) { -+ case CE: -+ { -+ struct queue_node *node = node_create(err_info->time, err_info->nums); -+ -+ if (node == NULL) { -+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); -+ return; -+ } -+ push(cpu_infos[cpu].ce_queue, node); -+ cpu_infos[cpu].ce_nums += err_info->nums; -+ break; -+ } -+ default: -+ break; -+ } -+} -+ -+void ras_record_cpu_error(struct error_info *err_info, int cpu) -+{ -+ int ret; -+ -+ if (enabled == 0) -+ return; -+ -+ if (cpu >= ncores || cpu < 0) { -+ log(TERM, LOG_ERR, -+ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores); -+ return; -+ } -+ -+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu); -+ cpu_infos[cpu].state = get_cpu_status(cpu); -+ -+ if (cpu_infos[cpu].state != CPU_ONLINE) { -+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu); -+ return; -+ } -+ -+ record_error_info(cpu, err_info); -+ /* -+ * Since user may change cpu state, we get current offlined -+ * cpu numbers every recording time. 
-+ */ -+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { -+ log(TERM, LOG_WARNING, -+ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", -+ cpu_limit.value); -+ return; -+ } -+ -+ ret = error_handler(cpu, err_info); -+ if (ret == HANDLE_NOTHING) -+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); -+ else if (ret == HANDLE_SUCCEED) { -+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+ clear_queue(cpu_infos[cpu].ce_queue); -+ cpu_infos[cpu].ce_nums = 0; -+ } else -+ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", -+ cpu, cpu_state[cpu_infos[cpu].state]); -+} -diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h -new file mode 100644 -index 0000000..35b5225 ---- /dev/null -+++ b/ras-cpu-isolation.h -@@ -0,0 +1,67 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ */ -+ -+#ifndef __RAS_CPU_ISOLATION_H -+#define __RAS_CPU_ISOLATION_H -+ -+#include "queue.h" -+ -+#define MAX_BUF_LEN 1024 -+ -+struct param { -+ char *name; -+ unsigned long value; -+}; -+ -+struct isolation_param { -+ char *name; -+ const struct param *units; -+ unsigned long value; -+ unsigned long limit; -+}; -+ -+enum cpu_state { -+ CPU_OFFLINE, -+ CPU_ONLINE, -+ CPU_OFFLINE_FAILED, -+ CPU_UNKNOWN, -+}; -+ -+enum error_handle_result { -+ HANDLE_FAILED = -1, -+ HANDLE_SUCCEED, -+ HANDLE_NOTHING, -+}; -+ -+enum error_type { -+ CE = 1 -+}; -+ -+struct cpu_info { -+ unsigned long ce_nums; -+ struct link_queue *ce_queue; -+ enum cpu_state state; -+}; -+ -+struct error_info { -+ unsigned long nums; -+ time_t time; -+ enum error_type err_type; -+}; -+ -+void ras_cpu_isolation_init(unsigned int cpus); -+void ras_record_cpu_error(struct error_info *err_info, int cpu); -+void cpu_infos_free(void); -+ -+#endif -diff --git a/ras-events.c b/ras-events.c -index fe4bd26..2a7d709 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -41,6 +41,7 @@ - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-page-isolation.h" -+#include "ras-cpu-isolation.h" - - /* - * Polling time, if read() doesn't block. 
Currently, trace_pipe_raw never -@@ -855,6 +856,10 @@ int handle_ras_events(int record_events) - - cpus = get_num_cpus(ras); - -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ ras_cpu_isolation_init(cpus); -+#endif -+ - #ifdef HAVE_MCE - rc = register_mce_handler(ras, cpus); - if (rc) -@@ -981,6 +986,8 @@ err: - } - free(ras); - } -- -+#ifdef HAVE_CPU_FAULT_ISOLATION -+ cpu_infos_free(); -+#endif - return rc; - } --- -2.33.1 - diff --git a/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch b/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch new file mode 100644 index 0000000000000000000000000000000000000000..1474b7d2c9bd9c96be286d64cb20fac267baa4bf --- /dev/null +++ b/1003-rasdaemon-trace-SIGBUS-event-for-hardware-error.patch @@ -0,0 +1,734 @@ +From e14173ad86ac94b9e4af84eaddb1abe3bc6410b7 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 18 Mar 2025 15:25:09 +0800 +Subject: [PATCH] rasdaemon: trace SIGBUS event for hardware error + +The kernel sends SIGBUS to a program when it reads a DE/UE. Use rasdaemon +to catch this SIGBUS and print it as follows: + <...>-71085 [056] d... 
0.007781 signal_generate \ + 2025-03-18 15:24:11 +0800 signal: Bus error, errorno: 0, code: 4, \ + comm: einj_mem_uc, pid: 71085, grp: 0, res: Delivered, \ + msg: Hardware memory error consumed: action required + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 5 +- + configure.ac | 11 ++++ + ras-events.c | 27 +++++++- + ras-events.h | 1 + + ras-record.c | 75 +++++++++++++++++++++++ + ras-record.h | 20 ++++++ + ras-report.c | 82 +++++++++++++++++++++++++ + ras-report.h | 6 +- + ras-signal-handler.c | 143 +++++++++++++++++++++++++++++++++++++++++++ + ras-signal-handler.h | 30 +++++++++ + util/ras-mc-ctl.in | 42 ++++++++++++- + 11 files changed, 438 insertions(+), 4 deletions(-) + create mode 100644 ras-signal-handler.c + create mode 100644 ras-signal-handler.h + +diff --git a/Makefile.am b/Makefile.am +index e3e66bb..1306d97 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -112,6 +112,9 @@ endif + if WITH_JAGUAR_NS_DECODE + rasdaemon_SOURCES += non-standard-jaguarmicro.c + endif ++if WITH_SIGNAL ++ rasdaemon_SOURCES += ras-signal-handler.c ++endif + + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) +@@ -122,7 +125,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ +- non-standard-jaguarmicro.h trigger.h unified-sel.h ++ non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 1cb00b6..25e0cb2 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -244,6 +244,16 @@ AS_IF([test "x$enable_yitian_ns_decode" = "xyes" || test "x$enable_all" == "xyes + 
AM_CONDITIONAL([WITH_YITIAN_NS_DECODE], [test x$enable_yitian_ns_decode = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_YITIAN_NS_DECODE], [USE_YITIAN_NS_DECODE="yes"], [USE_YITIAN_NS_DECODE="no"]) + ++AC_ARG_ENABLE([signal], ++ AS_HELP_STRING([--enable-signal], [enable signal event(currently experimental)])) ++ ++AS_IF([test "x$enable_signal" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_SIGNAL,1,"have signal event") ++ AC_SUBST([WITH_SIGNAL]) ++]) ++AM_CONDITIONAL([WITH_SIGNAL], [test x$enable_signal = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_SIGNAL], [USE_SIGNAL="yes"], [USE_SIGNAL="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -290,4 +300,5 @@ compile time options summary + CPU fault isolation : $USE_CPU_FAULT_ISOLATION + YITIAN RAS errors : $USE_YITIAN_NS_DECODE + JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE ++ Signal : $USE_SIGNAL + EOF +diff --git a/ras-events.c b/ras-events.c +index 6692a31..2220e9a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -34,6 +34,7 @@ + #include "ras-memory-failure-handler.h" + #include "ras-non-standard-handler.h" + #include "ras-page-isolation.h" ++#include "ras-signal-handler.h" + #include "ras-record.h" + #include "trigger.h" + +@@ -315,6 +316,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable); + #endif + ++#ifdef HAVE_SIGNAL ++ rc |= __toggle_ras_mc_event(ras, "signal", "signal_generate", enable); ++#endif ++ + free_ras: + free(ras); + if (rc) +@@ -335,7 +340,7 @@ static void setup_event_trigger(char *event) + } + + #ifdef HAVE_DISKERROR +-#ifndef HAVE_BLK_RQ_ERROR ++#if (!defined(HAVE_BLK_RQ_ERROR)) || defined(HAVE_SIGNAL) + /* + * Set kernel filter. libtrace doesn't provide an API for setting filters + * in kernel, we have to implement it here. 
+@@ -943,6 +948,10 @@ int handle_ras_events(int record_events, int enable_ipmitool) + #ifdef HAVE_DEVLINK + char *filter_str = NULL; + #endif ++#ifdef HAVE_SIGNAL ++ char signal_filter[64]; ++#endif ++ + + ras = calloc(1, sizeof(*ras)); + if (!ras) { +@@ -1173,6 +1182,22 @@ int handle_ras_events(int record_events, int enable_ipmitool) + "cxl", "memory_module"); + #endif + ++#ifdef HAVE_SIGNAL ++ snprintf(signal_filter, sizeof(signal_filter), "sig == %d && code >= %d", SIGBUS, BUS_OBJERR); ++ // ensure filter enabled ++ usleep(30000); ++ rc = filter_ras_mc_event(ras, "signal", "signal_generate", signal_filter); ++ if (!rc) { ++ rc = add_event_handler(ras, pevent, page_size, "signal", "signal_generate", ++ ras_signal_event_handler, NULL, SIGNAL_EVENT); ++ if (!rc) ++ num_events++; ++ else if (rc != -EINVAL) ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "signal", "signal_generate"); ++ } ++#endif ++ + if (!num_events) { + log(ALL, LOG_INFO, + "Failed to trace any supported RAS events. 
Aborting.\n"); +diff --git a/ras-events.h b/ras-events.h +index 83d41df..1689a12 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -35,6 +35,7 @@ enum { + CXL_GENERAL_MEDIA_EVENT, + CXL_DRAM_EVENT, + CXL_MEMORY_MODULE_EVENT, ++ SIGNAL_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index eed7aca..31a93a4 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -1142,6 +1142,61 @@ int ras_store_cxl_memory_module_event(struct ras_events *ras, + } + #endif + ++#ifdef HAVE_SIGNAL ++static const struct db_fields signal_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "sig", .type = "INTEGER" }, ++ { .name = "errorno", .type = "INTEGER" }, ++ { .name = "code", .type = "INTEGER" }, ++ { .name = "comm", .type = "TEXT" }, ++ { .name = "pid", .type = "INTEGER" }, ++ { .name = "grp", .type = "INTEGER" }, ++ { .name = "res", .type = "INTEGER" }, ++ ++}; ++ ++static const struct db_table_descriptor signal_event_tab = { ++ .name = "signal_event", ++ .fields = signal_event_fields, ++ .num_fields = ARRAY_SIZE(signal_event_fields), ++}; ++ ++int ras_store_signal_event(struct ras_events *ras, struct ras_signal_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_signal_event) ++ return -1; ++ log(TERM, LOG_INFO, "signal_event store: %p\n", priv->stmt_signal_event); ++ ++ sqlite3_bind_text(priv->stmt_signal_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int(priv->stmt_signal_event, 2, ev->sig); ++ sqlite3_bind_int(priv->stmt_signal_event, 3, ev->error_no); ++ sqlite3_bind_int(priv->stmt_signal_event, 4, ev->code); ++ sqlite3_bind_text(priv->stmt_signal_event, 5, ev->comm, -1, NULL); ++ sqlite3_bind_int(priv->stmt_signal_event, 6, ev->pid); ++ sqlite3_bind_int(priv->stmt_signal_event, 7, ev->group); ++ sqlite3_bind_int(priv->stmt_signal_event, 8, ev->result); ++ ++ rc = sqlite3_step(priv->stmt_signal_event); ++ if (rc != SQLITE_OK && rc 
!= SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do signal_event step on sqlite: error = %d\n", rc); ++ ++ rc = sqlite3_reset(priv->stmt_signal_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset signal_event on sqlite: error = %d\n", ++ rc); ++ ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ + /* + * Generic code + */ +@@ -1550,6 +1605,16 @@ int ras_mc_event_opendb(unsigned int cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_SIGNAL ++ rc = ras_mc_create_table(priv, &signal_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_signal_event, ++ &signal_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } ++#endif ++ + ras->db_priv = priv; + return 0; + +@@ -1734,6 +1799,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_SIGNAL ++ if (priv->stmt_signal_event) { ++ rc = sqlite3_finalize(priv->stmt_signal_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize signal_event sqlite: error = %d\n", ++ cpu, rc); ++ } ++#endif ++ + rc = sqlite3_close_v2(db); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, +diff --git a/ras-record.h b/ras-record.h +index eec0702..2dd6630 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -9,6 +9,7 @@ + #define __RAS_RECORD_H + + #include ++#include + #include + #include + +@@ -258,6 +259,17 @@ struct ras_cxl_memory_module_event { + uint8_t res_id[CXL_PLDM_RES_ID_LEN]; + }; + ++struct ras_signal_event { ++ char timestamp[64]; ++ int sig; ++ int error_no; ++ int code; ++ char *comm; ++ pid_t pid; ++ int group; ++ int result; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -275,6 +287,7 @@ struct ras_cxl_generic_event; + struct ras_cxl_general_media_event; + struct ras_cxl_dram_event; + struct ras_cxl_memory_module_event; ++struct ras_signal_event; + + #ifdef HAVE_SQLITE3 + +@@ -315,6 +328,9 @@ struct 
sqlite3_priv { + sqlite3_stmt *stmt_cxl_dram_event; + sqlite3_stmt *stmt_cxl_memory_module_event; + #endif ++#ifdef HAVE_SIGNAL ++ sqlite3_stmt *stmt_signal_event; ++#endif + }; + + struct db_fields { +@@ -361,6 +377,8 @@ int ras_store_cxl_dram_event(struct ras_events *ras, + struct ras_cxl_dram_event *ev); + int ras_store_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev); ++int ras_store_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned int cpu, +@@ -401,6 +419,8 @@ static inline int ras_store_cxl_dram_event(struct ras_events *ras, + struct ras_cxl_dram_event *ev) { return 0; }; + static inline int ras_store_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev) { return 0; }; ++static inline int ras_store_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 4535421..35d2792 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -13,6 +13,7 @@ + #include + + #include "ras-report.h" ++#include "ras-record.h" + + static int setup_report_socket(void) + { +@@ -735,6 +736,37 @@ static int set_cxl_memory_module_event_backtrace(char *buf, struct ras_cxl_memor + return 0; + } + ++static int set_signal_event_backtrace(char *buf, struct ras_signal_event *ev) ++{ ++ unsigned int size = MAX_BACKTRACE_SIZE; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ while (*buf && size > 0) { ++ buf++; ++ size--; ++ } ++ ++ snprintf(buf, size, "BACKTRACE=" ++ "timestamp=%s\n" ++ "signal=%d\n" ++ "errorno=%d\n" ++ "code=%d\n" ++ "comm=%s\n" ++ "grp=%d\n" ++ "res=%d\n", ++ ev->timestamp, ++ ev->sig, ++ ev->error_no, ++ ev->code, ++ ev->comm, ++ ev->group, ++ ev->result); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev) + { + char buf[MAX_BACKTRACE_SIZE]; +@@ -812,6 +844,10 @@ static int commit_report_backtrace(int 
sockfd, int type, void *ev) + rc = set_cxl_memory_module_event_backtrace(buf, + (struct ras_cxl_memory_module_event *)ev); + break; ++ case SIGNAL_EVENT: ++ rc = set_signal_event_backtrace(buf, ++ (struct ras_signal_event *)ev); ++ break; + default: + return -1; + } +@@ -1552,3 +1588,49 @@ cxl_memory_module_fail: + + return -1; + } ++/* Send a SIGNAL_EVENT report through the report socket; returns 0 on success, -1 on any failure. */ ++int ras_report_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto signal_fail; ++ ++ rc = commit_report_backtrace(sockfd, SIGNAL_EVENT, ev); ++ if (rc < 0) ++ goto signal_fail; ++ /* write() can return -1; test rc < 0 before the unsigned length compare */ ++ snprintf(buf, MAX_MESSAGE_SIZE, "ANALYZER=%s", ++ "rasdaemon-signal_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < 0 || (size_t)rc < strlen(buf) + 1) ++ goto signal_fail; ++ ++ snprintf(buf, MAX_MESSAGE_SIZE, "REASON=%s", "SIGBUS for Hardware error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < 0 || (size_t)rc < strlen(buf) + 1) ++ goto signal_fail; ++ ++ done = 1; ++ ++signal_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index ceb64ce..f680a25 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -57,6 +57,8 @@ int ras_report_cxl_dram_event(struct ras_events *ras, + struct ras_cxl_dram_event *ev); + int ras_report_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev); ++int ras_report_signal_event(struct ras_events *ras, ++ struct ras_signal_event *ev); + + #else + +@@ -108,7 +110,9 @@ static inline int ras_report_cxl_dram_event(struct ras_events *ras, + static inline int ras_report_cxl_memory_module_event(struct ras_events *ras, + struct ras_cxl_memory_module_event *ev) + { return 0; }; +- ++static inline int ras_report_signal_event(struct ras_events *ras, ++ 
struct ras_signal_event *ev) ++{ return 0; }; + #endif + + #endif +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +new file mode 100644 +index 0000000..fb0bfd3 +--- /dev/null ++++ b/ras-signal-handler.c +@@ -0,0 +1,143 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2025 Ruidong Tian ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++ ++#include "ras-signal-handler.h" ++#include "ras-report.h" ++#include "types.h" ++ ++enum { ++ TRACE_SIGNAL_DELIVERED, ++ TRACE_SIGNAL_IGNORED, ++ TRACE_SIGNAL_ALREADY_PENDING, ++ TRACE_SIGNAL_OVERFLOW_FAIL, ++ TRACE_SIGNAL_LOSE_INFO, ++}; ++ ++static char *signal_msg[] = { ++ [BUS_ADRALN] = "invalid address alignment", ++ [BUS_ADRERR] = "non-existent address", ++ [BUS_OBJERR] = "object-specific hardware error", ++ [BUS_MCEERR_AR] = "Hardware memory error consumed: action required", ++ [BUS_MCEERR_AO] = "Hardware memory error detected in process but not consumed: action optional", ++}; ++ ++static char *errcode_str[] = { ++ [BUS_ADRALN] = "BUS_ADRALN", ++ [BUS_ADRERR] = "BUS_ADRERR", ++ [BUS_OBJERR] = "BUS_OBJERR", ++ [BUS_MCEERR_AR] = "BUS_MCEERR_AR", ++ [BUS_MCEERR_AO] = "BUS_MCEERR_AO", ++}; ++ ++static char *signal_res[] = { ++ 
[TRACE_SIGNAL_DELIVERED] = "Delivered", ++ [TRACE_SIGNAL_IGNORED] = "Ignore", ++ [TRACE_SIGNAL_ALREADY_PENDING] = "Already pending", ++ [TRACE_SIGNAL_OVERFLOW_FAIL] = "Overflow fail", ++ [TRACE_SIGNAL_LOSE_INFO] = "Lose info", ++}; ++ ++static void report_ras_signal_event(struct trace_seq *s, struct ras_signal_event *ev) ++{ ++ trace_seq_printf(s, ++ "%s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s", ++ ev->timestamp, strsignal(ev->sig), ev->error_no, ++ (ev->code < 0 || ev->code > BUS_MCEERR_AO) ? "Unknown" : errcode_str[ev->code], ++ ev->comm, ev->pid, ++ ev->group, ++ (ev->result < 0 || ev->result > TRACE_SIGNAL_LOSE_INFO) ? "Unknown" : signal_res[ev->result], ++ ev->sig == SIGBUS ? signal_msg[ev->code] : "Unknown"); ++} ++ ++int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_signal_event ev; ++ ++ /* ++ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. ++ * On previous kernels, the way to properly generate an event would ++ * be to inject a fake one, measure its timestamp and diff it against ++ * gettimeofday. We won't do it here. Instead, let's use uptime, ++ * falling-back to the event report's time, if "uptime" clock is ++ * not available (legacy kernels). 
++ */ ++ ++ if (ras->use_uptime) ++ now = record->ts / user_hz + ras->uptime_diff; ++ else ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ if (tep_get_field_val(s, event, "sig", record, &val, 1) < 0) ++ return -1; ++ ev.sig = val; ++ ++ if (tep_get_field_val(s, event, "errno", record, &val, 1) < 0) ++ return -1; ++ ev.error_no = val; ++ ++ if (tep_get_field_val(s, event, "code", record, &val, 1) < 0) ++ return -1; ++ ev.code = val; ++ ++ ev.comm = tep_get_field_raw(s, event, "comm", record, &len, 1); ++ if (!ev.comm) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "pid", record, &val, 1) < 0) ++ return -1; ++ ev.pid = val; ++ ++ if (tep_get_field_val(s, event, "group", record, &val, 1) < 0) ++ return -1; ++ ev.group = val; ++ ++ if (tep_get_field_val(s, event, "result", record, &val, 1) < 0) ++ return -1; ++ ev.result = val; ++ ++ report_ras_signal_event(s, &ev); ++ ++ /* Store data into the SQLite DB */ ++#ifdef HAVE_SQLITE3 ++ ras_store_signal_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_signal_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-signal-handler.h b/ras-signal-handler.h +new file mode 100644 +index 0000000..9740c61 +--- /dev/null ++++ b/ras-signal-handler.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (C) 2025 Ruidong Tian ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef __RAS_SIGNAL_HANDLER_H ++#define __RAS_SIGNAL_HANDLER_H ++ ++#include ++ ++#include "ras-events.h" ++ ++int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record, ++ struct tep_event *event, void *context); ++ ++#endif +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index ba48660..648517f 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -35,6 +35,7 @@ my $has_disk_errors = 0; + my $has_extlog = 0; + my $has_mem_failure = 0; + my $has_mce = 0; ++my $has_signal = 0; + + @WITH_AER_TRUE@$has_aer = 1; + @WITH_ARM_TRUE@$has_arm = 1; +@@ -44,6 +45,7 @@ my $has_mce = 0; + @WITH_EXTLOG_TRUE@$has_extlog = 1; + @WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1; + @WITH_MCE_TRUE@$has_mce = 1; ++@WITH_SIGNAL_TRUE@$has_signal = 1; + + my %conf = (); + my %bus = (); +@@ -1546,7 +1548,7 @@ sub summary + { + require DBI; + my ($query, $query_handle, $out); +- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result); ++ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result, $sigcode); + my ($etype, $severity, $etype_string, $severity_string); + my ($dev_name, $dev); + my ($mpidr, $memdev); +@@ -1828,6 +1830,24 @@ sub summary + $query_handle->finish; + } + ++ # Signal event ++ if ($has_signal == 1) { ++ $query = "select code, count(*) from signal_event$conf{opt}{since} group by code"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($sigcode, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$sigcode errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "SIGNAL events summary:\n$out\n"; ++ } else { ++ print "No SIGNAL.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + undef($dbh); + } + +@@ -1849,6 
+1869,7 @@ sub errors + my ($nibble_mask, $bank_group, $row, $column, $cor_mask); + my ($event_type, $event_sub_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status); + my ($sub_type, $sub_channel, $cme_threshold_ev_flags, $cme_count, $cvme_count); ++ my ($signal, $errorno, $code, $comm, $pid, $grp, $res); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -2366,6 +2387,25 @@ sub errors + $query_handle->finish; + } + ++ # SIGNAL event ++ if ($has_signal == 1) { ++ $query = "select id, timestamp, signal, errorno, code, comm, pid, grp, res from signal_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $signal, $errorno, $code, $comm, $pid, $grp, $res)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "signal=$signal, errorno=$errorno, code=$code, comm=$comm, pid=$pid, grp=$grp, res=$res\n"; ++ } ++ if ($out ne "") { ++ print "SIGNAL events:\n$out\n"; ++ } else { ++ print "No SIGNAL event.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + undef($dbh); + } + +-- +2.43.5 + diff --git a/1004-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch b/1004-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch deleted file mode 100644 index 88a1d1b48139a377b87b3258eaccb1d03be81548..0000000000000000000000000000000000000000 --- a/1004-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch +++ /dev/null @@ -1,151 +0,0 @@ -From 4b72881f3b264a4268a36741c8568f922557c4b4 Mon Sep 17 00:00:00 2001 -From: Shengwei Luo -Date: Wed, 23 Feb 2022 17:23:27 +0800 -Subject: [PATCH 04/85] rasdaemon: Support cpu fault isolation for recoverable - errors - -When the recoverable errors in cpu core occurred, try to offline -the related cpu core. 
- -Signed-off-by: Shengwei Luo -Signed-off-by: Junchong Pan -Signed-off-by: Lei Feng -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - ras-arm-handler.c | 22 +++++++++++++++++++--- - ras-cpu-isolation.c | 17 +++++++++++++++++ - ras-cpu-isolation.h | 4 +++- - 3 files changed, 39 insertions(+), 4 deletions(-) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 9c7a3c3..a0dfc51 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -26,6 +26,7 @@ - - #define ARM_ERR_VALID_ERROR_COUNT BIT(0) - #define ARM_ERR_VALID_FLAGS BIT(1) -+#define BIT2 2 - - void display_raw_data(struct trace_seq *s, - const uint8_t *buf, -@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s, - } - - #ifdef HAVE_CPU_FAULT_ISOLATION --static int count_errors(struct ras_arm_event *ev) -+static int is_core_failure(struct ras_arm_err_info *err_info) -+{ -+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) { -+ /* -+ * core failure: -+ * Bit 0\1\3: (at lease 1) -+ * Bit 2: 0 -+ */ -+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2)); -+ } -+ return 0; -+} -+ -+static int count_errors(struct ras_arm_event *ev, int sev) - { - struct ras_arm_err_info *err_info; - int num_pei; -@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev) - */ - error_count = err_info->multiple_error + 1; - } -+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info)) -+ error_count = 0; - - num += error_count; - err_info += 1; -@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s, - } - trace_seq_printf(s, "\n severity: %s", severity); - -- if (val == GHES_SEV_CORRECTED) { -- int nums = count_errors(ev); -+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) { -+ int nums = count_errors(ev, val); - - if (nums > 0) { - err_info.nums = nums; -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index 1694a08..90633fd 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -126,6 +126,7 @@ static int 
init_cpu_info(unsigned int cpus) - - for (unsigned int i = 0; i < cpus; ++i) { - cpu_infos[i].ce_nums = 0; -+ cpu_infos[i].uce_nums = 0; - cpu_infos[i].state = get_cpu_status(i); - cpu_infos[i].ce_queue = init_queue(); - -@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu) - return HANDLE_NOTHING; - } - -+static int do_uce_handler(unsigned int cpu) -+{ -+ if (cpu_infos[cpu].uce_nums > 0) { -+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu); -+ return do_cpu_offline(cpu); -+ } -+ return HANDLE_NOTHING; -+} -+ - static int error_handler(unsigned int cpu, struct error_info *err_info) - { - int ret = HANDLE_NOTHING; -@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info) - case CE: - ret = do_ce_handler(cpu); - break; -+ case UCE: -+ ret = do_uce_handler(cpu); -+ break; - default: - break; - } -@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) - cpu_infos[cpu].ce_nums += err_info->nums; - break; - } -+ case UCE: -+ cpu_infos[cpu].uce_nums++; -+ break; - default: - break; - } -@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) - cpu, cpu_state[cpu_infos[cpu].state]); - clear_queue(cpu_infos[cpu].ce_queue); - cpu_infos[cpu].ce_nums = 0; -+ cpu_infos[cpu].uce_nums = 0; - } else - log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", - cpu, cpu_state[cpu_infos[cpu].state]); -diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h -index 35b5225..5682106 100644 ---- a/ras-cpu-isolation.h -+++ b/ras-cpu-isolation.h -@@ -45,10 +45,12 @@ enum error_handle_result { - }; - - enum error_type { -- CE = 1 -+ CE = 1, -+ UCE - }; - - struct cpu_info { -+ unsigned long uce_nums; - unsigned long ce_nums; - struct link_queue *ce_queue; - enum cpu_state state; --- -2.33.1 - diff --git a/1004-rasdaemon-align-event-name-in-log.patch b/1004-rasdaemon-align-event-name-in-log.patch new file mode 100644 index 
0000000000000000000000000000000000000000..37f94a48c5ffd4e78cbd48032b64713ef071ec5c --- /dev/null +++ b/1004-rasdaemon-align-event-name-in-log.patch @@ -0,0 +1,34 @@ +From 86a6cbb904a50269c901ba2ed591fde7debfa298 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 18 Mar 2025 15:52:41 +0800 +Subject: [PATCH 04/30] rasdaemon: align event name in log + +Now rasdaemon event name is not align in log: + + <...>-52503 [070] dNh. 0.007127 arm_event ... + <...>-52503 [052] .... 0.007127 memory_failure_event ... +Align it and result look like: + <...>-113714 [059] dNh. 0.007942 arm_event: ... + <...>-113714 [069] .... 0.007942 memory_failure_event: ... + +Signed-off-by: Ruidong Tian +--- + ras-events.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/ras-events.c b/ras-events.c +index 2220e9a..88c8a5f 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -418,7 +418,7 @@ static void parse_ras_data(struct pthread_data *pdata, struct kbuffer *kbuf, + /* TODO - logging */ + trace_seq_init(&s); + tep_print_event(pdata->ras->pevent, &s, &record, +- "%16s-%-5d [%03d] %s %6.1000d %s %s", ++ "%16s-%-10d [%03d] %s %6.1000d %25s: %s", + TEP_PRINT_COMM, TEP_PRINT_PID, TEP_PRINT_CPU, + TEP_PRINT_LATENCY, TEP_PRINT_TIME, TEP_PRINT_NAME, + TEP_PRINT_INFO); +-- +2.43.5 + diff --git a/1005-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch b/1005-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch deleted file mode 100644 index d7b912754d9ed525135cce368ccfec333bc86086..0000000000000000000000000000000000000000 --- a/1005-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 5ed8df237ee0bc7f882259a8d05d7bce5cd98dab Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Wed, 20 Oct 2021 14:33:39 +0800 -Subject: [PATCH 05/85] rasdaemon: Fix some print format issues for hisi common - error section - -It is not right to use '%d' to print uint8_t and uint16_t, although -there is no function issue. 
Change to use '%hhu' and '%hu' separately. - -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisilicon.c | 20 ++++++++++---------- - 1 file changed, 10 insertions(+), 10 deletions(-) - -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index a6f5e78..41a9632 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -192,7 +192,7 @@ static const char* get_soc_desc(uint8_t soc_id) - static void decode_module(struct hisi_event *event, uint8_t module_id) - { - if (module_id >= sizeof(module_name)/sizeof(char *)) -- HISI_SNPRINTF(event->error_msg, "module=unknown(id=%d) ", module_id); -+ HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id); - else - HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); - } -@@ -201,36 +201,36 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, - const struct hisi_common_error_section *err, - struct hisi_event *event) - { -- HISI_SNPRINTF(event->error_msg, "[ table_version=%d", err->version); -+ HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version); - if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) - HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id)); - - if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) -- HISI_SNPRINTF(event->error_msg, "socket_id=%d", err->socket_id); -+ HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) -- HISI_SNPRINTF(event->error_msg, "totem_id=%d", err->totem_id); -+ HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) -- HISI_SNPRINTF(event->error_msg, "nimbus_id=%d", err->nimbus_id); -+ HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) -- HISI_SNPRINTF(event->error_msg, "subsystem_id=%d", err->subsystem_id); -+ 
HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID)) - decode_module(event, err->module_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) -- HISI_SNPRINTF(event->error_msg, "submodule_id=%d", err->submodule_id); -+ HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) -- HISI_SNPRINTF(event->error_msg, "core_id=%d", err->core_id); -+ HISI_SNPRINTF(event->error_msg, "core_id=%hhu", err->core_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) -- HISI_SNPRINTF(event->error_msg, "port_id=%d", err->port_id); -+ HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id); - - if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) -- HISI_SNPRINTF(event->error_msg, "err_type=%d", err->err_type); -+ HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type); - - if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) - HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x", --- -2.33.1 - diff --git a/1005-rasdaemon-skip-doesn-t-exist-event.patch b/1005-rasdaemon-skip-doesn-t-exist-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..1f6cbd2a611556a1f826bffbff65a0896af068f9 --- /dev/null +++ b/1005-rasdaemon-skip-doesn-t-exist-event.patch @@ -0,0 +1,56 @@ +From 7a13978040e6aa3e841cbbd5e6f91e5f98ae8d82 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 25 Mar 2025 10:16:13 +0800 +Subject: [PATCH 05/30] rasdaemon: skip doesn't exist event + +When compiling rasdaemon with the --enable-all configuration flag, +the system may detect unsupported hardware events - for instance, +ARM-specific events on x86 architectures. This causes the program +to enter a busy-wait loop in the wait_access function. A better +approach would be to explicitly skip these architecture-mismatched +events during initialization. 
+ +Signed-off-by: Ruidong Tian +--- + ras-events.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/ras-events.c b/ras-events.c +index 88c8a5f..d42ed9f 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -826,6 +826,18 @@ static int select_tracing_timestamp(struct ras_events *ras) + return 0; + } + ++static bool check_event_exist(struct ras_events *ras, char *group, char *event) ++{ ++ char fname[MAX_PATH + 256]; ++ ++ snprintf(fname, sizeof(fname), "%s/tracing/events/%s/%s", ++ ras->debugfs, group, event); ++ if (access(fname, F_OK) == 0) ++ return true; ++ ++ return false; ++} ++ + #define EVENT_DISABLED 1 + + static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, +@@ -837,6 +849,12 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, + char *page, fname[MAX_PATH + 1]; + struct tep_event_filter *filter = NULL; + ++ if (!check_event_exist(ras, group, event)) { ++ log(ALL, LOG_WARNING, "%s:%s event not exist\n", ++ group, event); ++ return -EINVAL; ++ } ++ + snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event); + + fd = open_trace(ras, fname, O_RDONLY); +-- +2.43.5 + diff --git a/1006-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch b/1006-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch deleted file mode 100644 index 7d9563f6af0575418f00e49372cf095e99287c81..0000000000000000000000000000000000000000 --- a/1006-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch +++ /dev/null @@ -1,229 +0,0 @@ -From 29e82255ec841cc042e1f5733cfe267b02a78db8 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Wed, 2 Mar 2022 12:20:40 +0000 -Subject: [PATCH 06/85] rasdaemon: Modify recording Hisilicon common error data - -The error statistics for the Hisilicon common -error need to do based on module, error severity etc. - -Modify recording Hisilicon common error data as separate fields -in the sql db table instead of the combined single field. 
- -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisilicon.c | 126 ++++++++++++++++++++++++++++++++------- - 1 file changed, 104 insertions(+), 22 deletions(-) - -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 41a9632..cd0ab3f 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -17,6 +17,7 @@ - #include "non-standard-hisilicon.h" - - #define HISI_BUF_LEN 2048 -+#define HISI_PCIE_INFO_BUF_LEN 256 - - struct hisi_common_error_section { - uint32_t val_bits; -@@ -63,12 +64,25 @@ enum { - enum { - HISI_COMMON_FIELD_ID, - HISI_COMMON_FIELD_TIMESTAMP, -- HISI_COMMON_FIELD_ERR_INFO, -+ HISI_COMMON_FIELD_VERSION, -+ HISI_COMMON_FIELD_SOC_ID, -+ HISI_COMMON_FIELD_SOCKET_ID, -+ HISI_COMMON_FIELD_TOTEM_ID, -+ HISI_COMMON_FIELD_NIMBUS_ID, -+ HISI_COMMON_FIELD_SUB_SYSTEM_ID, -+ HISI_COMMON_FIELD_MODULE_ID, -+ HISI_COMMON_FIELD_SUB_MODULE_ID, -+ HISI_COMMON_FIELD_CORE_ID, -+ HISI_COMMON_FIELD_PORT_ID, -+ HISI_COMMON_FIELD_ERR_TYPE, -+ HISI_COMMON_FIELD_PCIE_INFO, -+ HISI_COMMON_FIELD_ERR_SEVERITY, - HISI_COMMON_FIELD_REGS_DUMP, - }; - - struct hisi_event { - char error_msg[HISI_BUF_LEN]; -+ char pcie_info[HISI_PCIE_INFO_BUF_LEN]; - char reg_msg[HISI_BUF_LEN]; - }; - -@@ -126,14 +140,26 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) - - #ifdef HAVE_SQLITE3 - static const struct db_fields hisi_common_section_fields[] = { -- { .name = "id", .type = "INTEGER PRIMARY KEY" }, -- { .name = "timestamp", .type = "TEXT" }, -- { .name = "err_info", .type = "TEXT" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "version", .type = "INTEGER" }, -+ { .name = "soc_id", .type = "INTEGER" }, -+ { .name = "socket_id", .type = "INTEGER" }, -+ { .name = "totem_id", .type = "INTEGER" }, -+ { .name = "nimbus_id", .type = "INTEGER" }, -+ { .name = "sub_system_id", .type = "INTEGER" }, -+ { .name = "module_id", .type = 
"TEXT" }, -+ { .name = "sub_module_id", .type = "INTEGER" }, -+ { .name = "core_id", .type = "INTEGER" }, -+ { .name = "port_id", .type = "INTEGER" }, -+ { .name = "err_type", .type = "INTEGER" }, -+ { .name = "pcie_info", .type = "TEXT" }, -+ { .name = "err_severity", .type = "TEXT" }, - { .name = "regs_dump", .type = "TEXT" }, - }; - - static const struct db_table_descriptor hisi_common_section_tab = { -- .name = "hisi_common_section", -+ .name = "hisi_common_section_v2", - .fields = hisi_common_section_fields, - .num_fields = ARRAY_SIZE(hisi_common_section_fields), - }; -@@ -189,12 +215,20 @@ static const char* get_soc_desc(uint8_t soc_id) - return soc_desc[soc_id]; - } - --static void decode_module(struct hisi_event *event, uint8_t module_id) -+static void decode_module(struct ras_ns_ev_decoder *ev_decoder, -+ struct hisi_event *event, uint8_t module_id) - { -- if (module_id >= sizeof(module_name)/sizeof(char *)) -+ if (module_id >= sizeof(module_name)/sizeof(char *)) { - HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id); -- else -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_MODULE_ID, -+ 0, "unknown"); -+ } else { - HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_MODULE_ID, -+ 0, module_name[module_id]); -+ } - } - - static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, -@@ -202,43 +236,93 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, - struct hisi_event *event) - { - HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version); -- if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_VERSION, -+ err->version, NULL); -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) { - HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id)); -+ 
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SOC_ID, -+ err->soc_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) { - HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SOCKET_ID, -+ err->socket_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) { - HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_TOTEM_ID, -+ err->totem_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) { - HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_NIMBUS_ID, -+ err->nimbus_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) { - HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SUB_SYSTEM_ID, -+ err->subsystem_id, NULL); -+ } - - if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID)) -- decode_module(event, err->module_id); -+ decode_module(ev_decoder, event, err->module_id); - -- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) { - HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_SUB_MODULE_ID, -+ err->submodule_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) { - HISI_SNPRINTF(event->error_msg, "core_id=%hhu", 
err->core_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_CORE_ID, -+ err->core_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) { - HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_PORT_ID, -+ err->port_id, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) { - HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT, -+ HISI_COMMON_FIELD_ERR_TYPE, -+ err->err_type, NULL); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) { - HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x", - err->pcie_info.segment, err->pcie_info.bus, - err->pcie_info.device, err->pcie_info.function); -+ HISI_SNPRINTF(event->pcie_info, "%04x:%02x:%02x.%x", -+ err->pcie_info.segment, err->pcie_info.bus, -+ err->pcie_info.device, err->pcie_info.function); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_PCIE_INFO, -+ 0, event->pcie_info); -+ } - -- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) { - HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity)); -+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_ERR_SEVERITY, -+ 0, err_severity(err->err_severity)); -+ } - - HISI_SNPRINTF(event->error_msg, "]"); - } -@@ -283,8 +367,6 @@ static int decode_hisi_common_section(struct ras_events *ras, - record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_TIMESTAMP, - 0, event->timestamp); -- record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, -- HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg); - 
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg); - step_vendor_data_tab(ev_decoder, "hisi_common_section_tab"); --- -2.33.1 - diff --git a/1006-rasdaemon-support-memory-corrected-error-statistics.patch b/1006-rasdaemon-support-memory-corrected-error-statistics.patch new file mode 100644 index 0000000000000000000000000000000000000000..845a3600e10148471f43eae1d8d2bce288cf1cb3 --- /dev/null +++ b/1006-rasdaemon-support-memory-corrected-error-statistics.patch @@ -0,0 +1,124 @@ +From 32bd3dc84cc235dc589ae6ac149a3567c7b501a6 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Tue, 25 Mar 2025 18:36:07 +0800 +Subject: [PATCH 06/30] rasdaemon: support memory corrected error statistics + +A high volume of Correctable Errors (CEs) indicates that the +memory controller is frequently performing Error-Correcting Code (ECC) +operations, which will increase memory controller latency. +The CE statistics feature can report the number of CEs occurring per +second. When the count exceeds a certain threshold, it signifies +intensive ECC activity and triggers warnings. + +New environment MC_CE_STAT_THRESHOLD to setup threshold. + +Signed-off-by: Ruidong Tian +--- + misc/rasdaemon.env | 5 +++++ + ras-mc-handler.c | 23 +++++++++++++++++++++++ + ras-mc-handler.h | 1 + + rasdaemon.c | 7 +++++++ + 4 files changed, 36 insertions(+) + +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 963aaa0..4375781 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -88,3 +88,8 @@ TRIGGER_DIR= + # MC_UE_TRIGGER=mc_event_trigger + MC_CE_TRIGGER= + MC_UE_TRIGGER= ++ ++# CE Statistic Threshold ++# ++# Specify the threshold of CE per second. 
++MC_CE_STAT_THRESHOLD=2000 +\ No newline at end of file +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index fdd85a9..7a18f73 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -103,6 +103,27 @@ free: + free(env[i]); + } + ++static unsigned long long per_sec_ce_count; ++unsigned long long mc_ce_stat_threshold; ++static time_t cur; ++static int ras_mc_event_stat(time_t now, struct ras_mc_event *e) ++{ ++ if (strcmp(e->error_type, "Corrected")) ++ return 0; ++ ++ if (cur == now) { ++ per_sec_ce_count += e->error_count; ++ } else { ++ cur = now; ++ per_sec_ce_count = e->error_count; ++ } ++ ++ if (per_sec_ce_count > mc_ce_stat_threshold) ++ log(ALL, LOG_ERR, " mc_event_stat: memory corrected error report %lld/sec\n", per_sec_ce_count); ++ ++ return 0; ++} ++ + int ras_mc_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +@@ -263,6 +284,8 @@ int ras_mc_event_handler(struct trace_seq *s, + + ras_store_mc_event(ras, &ev); + ++ ras_mc_event_stat(now, &ev); ++ + #ifdef HAVE_MEMORY_CE_PFA + /* Account page corrected errors */ + if (!strcmp(ev.error_type, "Corrected")) +diff --git a/ras-mc-handler.h b/ras-mc-handler.h +index 2aa3c28..cf12959 100644 +--- a/ras-mc-handler.h ++++ b/ras-mc-handler.h +@@ -10,6 +10,7 @@ + #include + + #include "ras-events.h" ++extern unsigned long long mc_ce_stat_threshold; + + void mc_event_trigger_setup(void); + +diff --git a/rasdaemon.c b/rasdaemon.c +index 840be61..d97665f 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -13,6 +13,7 @@ + #include "ras-events.h" + #include "ras-logger.h" + #include "ras-record.h" ++#include "ras-mc-handler.h" + #include "types.h" + + /* +@@ -23,6 +24,7 @@ + #define TOOL_DESCRIPTION "RAS daemon to log the RAS events." 
+ #define ARGS_DOC "" + #define DISABLE "DISABLE" ++#define MC_CE_STAT_THRESHOLD "MC_CE_STAT_THRESHOLD" + + const char *argp_program_version = TOOL_NAME " " VERSION; + const char *argp_program_bug_address = "Mauro Carvalho Chehab "; +@@ -126,6 +128,11 @@ int main(int argc, char *argv[]) + + choices_disable = getenv(DISABLE); + ++ if (getenv(MC_CE_STAT_THRESHOLD)) ++ mc_ce_stat_threshold = strtoull(getenv(MC_CE_STAT_THRESHOLD), NULL, 0); ++ if (mc_ce_stat_threshold) ++ log(TERM, LOG_INFO, "Threshold of memory Corrected Errors statistics is %lld\n", mc_ce_stat_threshold); ++ + #ifdef HAVE_MCE + const struct argp_option offline_options[] = { + {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, +-- +2.43.5 + diff --git a/1007-rasdaemon-introduce-poison-page-statistics.patch b/1007-rasdaemon-introduce-poison-page-statistics.patch new file mode 100644 index 0000000000000000000000000000000000000000..12dd35e30ffa21be01e8f81b6af46496eee12fd5 --- /dev/null +++ b/1007-rasdaemon-introduce-poison-page-statistics.patch @@ -0,0 +1,249 @@ +From 9e9a9b7cd802f7874f674fb024ef0dd93e223060 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Wed, 26 Mar 2025 14:03:33 +0800 +Subject: [PATCH 07/30] rasdaemon: introduce poison page statistics + +An excessive number of poison pages can lead to memory fragmentation, +which may degrade system performance. This patch introduces a threshold +monitoring mechanism for poison pages. When the number of poison pages +exceeds the predefined threshold, a warning is issued to alert +administrators. 
+ +Signed-off-by: Ruidong Tian +--- + Makefile.am | 7 +++++- + configure.ac | 6 ++++++ + misc/rasdaemon.env | 8 ++++++- + ras-memory-failure-handler.c | 5 +++++ + ras-memory-failure-handler.h | 2 ++ + ras-page-isolation.c | 6 ++++++ + ras-poison-page-stat.c | 41 ++++++++++++++++++++++++++++++++++++ + ras-poison-page-stat.h | 14 ++++++++++++ + rasdaemon.c | 9 ++++++++ + 9 files changed, 96 insertions(+), 2 deletions(-) + create mode 100644 ras-poison-page-stat.c + create mode 100644 ras-poison-page-stat.h + +diff --git a/Makefile.am b/Makefile.am +index 1306d97..56e992d 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -116,6 +116,10 @@ if WITH_SIGNAL + rasdaemon_SOURCES += ras-signal-handler.c + endif + ++if WITH_POISON_PAGE_STAT ++ rasdaemon_SOURCES += ras-poison-page-stat.c ++endif ++ + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) + +@@ -125,7 +129,8 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ +- non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h ++ non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ ++ ras-poison-page-stat.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 25e0cb2..5fe1862 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -254,6 +254,12 @@ AS_IF([test "x$enable_signal" = "xyes" || test "x$enable_all" == "xyes"], [ + AM_CONDITIONAL([WITH_SIGNAL], [test x$enable_signal = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_SIGNAL], [USE_SIGNAL="yes"], [USE_SIGNAL="no"]) + ++AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test 
"x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_POISON_PAGE_STAT,1,"have poison page statistics") ++ AC_SUBST([WITH_POISON_PAGE_STAT]) ++]) ++AM_CONDITIONAL([WITH_POISON_PAGE_STAT], [test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes" ]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 4375781..3aa3a0d 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -92,4 +92,10 @@ MC_UE_TRIGGER= + # CE Statistic Threshold + # + # Specify the threshold of CE per second. +-MC_CE_STAT_THRESHOLD=2000 +\ No newline at end of file ++MC_CE_STAT_THRESHOLD=2000 ++ ++# Poison page statistics ++# ++# Supported units: ++# POISON_STAT_THRESHOLD: kB ++POISON_STAT_THRESHOLD=102400 +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 4d20ce8..d4c293b 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -12,6 +12,7 @@ + + #include "ras-logger.h" + #include "ras-memory-failure-handler.h" ++#include "ras-poison-page-stat.h" + #include "ras-report.h" + #include "trigger.h" + #include "types.h" +@@ -208,6 +209,10 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + ev.action_result = get_action_result(val); + trace_seq_printf(s, "action_result=%s ", ev.action_result); + ++#ifdef HAVE_POISON_PAGE_STAT ++ ras_poison_page_stat(); ++#endif ++ + /* Store data into the SQLite DB */ + #ifdef HAVE_SQLITE3 + ras_store_mf_event(ras, &ev); +diff --git a/ras-memory-failure-handler.h b/ras-memory-failure-handler.h +index f0cea71..85e2dd2 100644 +--- a/ras-memory-failure-handler.h ++++ b/ras-memory-failure-handler.h +@@ -11,6 +11,8 @@ + + #include "ras-events.h" + ++extern unsigned long long 
poison_stat_threshold; ++ + void mem_fail_event_trigger_setup(void); + int ras_memory_failure_event_handler(struct trace_seq *s, + struct tep_record *record, +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 2166f5c..246cd12 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -15,6 +15,8 @@ + + #include "ras-logger.h" + #include "ras-page-isolation.h" ++#include "ras-poison-page-stat.h" ++#include "ras-record.h" + + #define PARSED_ENV_LEN 50 + #define ROW_ID_MAX_LEN 200 +@@ -349,6 +351,10 @@ static void page_offline(struct page_record *pr) + + log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", + addr, page_state[pr->offlined]); ++ ++#ifdef HAVE_POISON_PAGE_STAT ++ ras_poison_page_stat(); ++#endif + } + + static void page_record(struct page_record *pr, unsigned int count, time_t time) +diff --git a/ras-poison-page-stat.c b/ras-poison-page-stat.c +new file mode 100644 +index 0000000..2ce1d2a +--- /dev/null ++++ b/ras-poison-page-stat.c +@@ -0,0 +1,41 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "ras-logger.h" ++#include "ras-poison-page-stat.h" ++#include "types.h" ++ ++unsigned long long poison_stat_threshold; ++int ras_poison_page_stat(void) /* warn when HardwareCorrupted in /proc/meminfo exceeds poison_stat_threshold */ ++{ ++ FILE *fp; ++ char line[MAX_PATH]; ++ unsigned long long corrupted_kb = 0; ++ ++ fp = fopen("/proc/meminfo", "r"); ++ if (!fp) { ++ log(ALL, LOG_ERR, "Failed to open /proc/meminfo\n"); ++ return EXIT_FAILURE; ++ } ++ ++ while (fgets(line, sizeof(line), fp)) ++ if (strstr(line, "HardwareCorrupted")) ++ if (sscanf(line, "%*s %llukB", &corrupted_kb) == 1) ++ break; ++ ++ fclose(fp); ++ ++ if (poison_stat_threshold && corrupted_kb > poison_stat_threshold) /* 0 = threshold not configured */ ++ log(ALL, LOG_WARNING, "Poison page statistics exceeded threshold: %llu kB (threshold: %llu kB)\n", ++ corrupted_kb, poison_stat_threshold); ++ ++ return 0; ++} +diff --git a/ras-poison-page-stat.h b/ras-poison-page-stat.h +new file mode 100644
+index 0000000..4fe25d2 +--- /dev/null ++++ b/ras-poison-page-stat.h +@@ -0,0 +1,14 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#ifndef __RAS_POISON_PAGE_STAT_H ++#define __RAS_POISON_PAGE_STAT_H ++ ++extern unsigned long long poison_stat_threshold; ++ ++int ras_poison_page_stat(void); ++ ++#endif +diff --git a/rasdaemon.c b/rasdaemon.c +index d97665f..6505dee 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -12,6 +12,7 @@ + + #include "ras-events.h" + #include "ras-logger.h" ++#include "ras-poison-page-stat.h" + #include "ras-record.h" + #include "ras-mc-handler.h" + #include "types.h" +@@ -25,6 +26,7 @@ + #define ARGS_DOC "" + #define DISABLE "DISABLE" + #define MC_CE_STAT_THRESHOLD "MC_CE_STAT_THRESHOLD" ++#define POISON_STAT_THRESHOLD "POISON_STAT_THRESHOLD" + + const char *argp_program_version = TOOL_NAME " " VERSION; + const char *argp_program_bug_address = "Mauro Carvalho Chehab "; +@@ -133,6 +135,13 @@ int main(int argc, char *argv[]) + if (mc_ce_stat_threshold) + log(TERM, LOG_INFO, "Threshold of memory Corrected Errors statistics is %lld\n", mc_ce_stat_threshold); + ++#ifdef HAVE_POISON_PAGE_STAT ++ if (getenv(POISON_STAT_THRESHOLD)) ++ poison_stat_threshold = strtoull(getenv(POISON_STAT_THRESHOLD), NULL, 0); ++ if (poison_stat_threshold) ++ log(TERM, LOG_INFO, "Threshold of poison page statistics is %lld kB\n", poison_stat_threshold); ++#endif ++ + #ifdef HAVE_MCE + const struct argp_option offline_options[] = { + {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, +-- +2.43.5 + diff --git a/1007-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch b/1007-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch deleted file mode 100644 index 351b5100f29a1f004be0eef5aa6d82efc4a27c4b..0000000000000000000000000000000000000000 --- a/1007-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch +++ /dev/null @@ -1,98 +0,0 @@ -From ee109afe0bf76c436c19fa0dc8ec70ded87e2677 Mon Sep 17 
00:00:00 2001 -From: Shiju Jose -Date: Thu, 24 Feb 2022 18:02:14 +0000 -Subject: [PATCH 07/85] rasdaemon: ras-mc-ctl: Modify error statistics for - HiSilicon KunPeng9xx common errors - -Modify the error statistics for the HiSilicon KunPeng9xx platforms common errors -to display the statistics and error info based on the module and the error severity. - -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 40 +++++++++++++++++++++++++++++----------- - 1 file changed, 29 insertions(+), 11 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 0691f29..c4bef8f 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1537,7 +1537,7 @@ sub vendor_errors_summary - require DBI; - my ($num_args, $platform_id); - my ($query, $query_handle, $count, $out); -- my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info); -+ my ($module_id, $sub_module_id, $err_severity, $err_sev); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1614,13 +1614,18 @@ sub vendor_errors_summary - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $query = "select err_info, count(*) from hisi_common_section"; -+ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -- $query_handle->bind_columns(\($err_info, $count)); -+ $query_handle->bind_columns(\($err_severity, $module_id, $count)); - $out = ""; -+ $err_sev = ""; - while($query_handle->fetch()) { -- $out .= "\terrors: $count\n"; -+ if ($err_severity ne $err_sev) { -+ $out .= "$err_severity errors:\n"; -+ $err_sev = $err_severity; -+ } -+ $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -@@ -1638,8 +1643,8 @@ sub vendor_errors - require DBI; - my ($num_args, $platform_id); - my ($query, $query_handle, $id, $timestamp, $out); -- 
my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id); -- my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs); -+ my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); -+ my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1727,15 +1732,28 @@ sub vendor_errors - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id"; -+ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs)); -+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp "; -- $out .= "Error Info:$err_info \n" if ($err_info); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "$id. 
$timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "totem_id=$totem_id, " if ($totem_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "core_id=$core_id, " if ($core_id); -+ $out .= "port_id=$port_id, " if ($port_id); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "pcie_info=$pcie_info, " if ($pcie_info); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs" if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events:\n$out\n"; --- -2.33.1 - diff --git a/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch b/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch new file mode 100644 index 0000000000000000000000000000000000000000..ee72a7446bf0c4fe82dea7b494a4d696e6295268 --- /dev/null +++ b/1008-rasdaemon-erst-decode-panic-mce-through-erst.patch @@ -0,0 +1,468 @@ +From d64ff047a5ab231ee6c1a797dc3ce612fb7a5a6c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 12 Dec 2024 09:37:06 +0800 +Subject: [PATCH 08/30] rasdaemon: erst: decode panic mce through erst + +ERST records the MCE information that caused the kernel panic, +helping us determine the cause of the last crash. +Using rasdaemon to check and parse the ERST records at startup. +Decoded info like follow: + <...>-0 [-01] .... 
0.000000 mce_erst_record: 2025-03-26 14:52:42 +0800 bank=1, status= bd80000000100134, Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error, mci=Uncorrected_error Error_enabled SRAR Uncorrected_error Error_enabled SRAR Uncorrected_error Error_enabled SRAR, mca=Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error Data CACHE Level-0 Data-Read Error K, cpu_type= Sapphirerapids server, cpu= 159, socketid= 1, ip= ffffffff914a6476, cs= 10, misc= 86, addr= 8158f58400, mcgstatus=15 RIPV EIPV MCIP LMCE mcgstatus=15 RIPV EIPV MCIP LMCE mcgstatus=15 RIPV EIPV MCIP LMCE, mcgcap= f000c15, apicid= 9f, ppin= fc6b80e0ba9d616, microcode= 2b000571 + +Now environment ERST_DELETE is introduced, rasdaemon will delete +origin erst file if ERST_DELETE set. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 5 +- + configure.ac | 11 +++ + misc/rasdaemon.env | 2 + + ras-erst.c | 195 +++++++++++++++++++++++++++++++++++++++++++++ + ras-erst.h | 17 ++++ + ras-mce-handler.c | 35 ++++++-- + ras-mce-handler.h | 4 + + ras-record.h | 4 + + rasdaemon.c | 11 +++ + 9 files changed, 275 insertions(+), 9 deletions(-) + create mode 100644 ras-erst.c + create mode 100644 ras-erst.h + +diff --git a/Makefile.am b/Makefile.am +index 56e992d..e1bcda1 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -119,6 +119,9 @@ endif + if WITH_POISON_PAGE_STAT + rasdaemon_SOURCES += ras-poison-page-stat.c + endif ++if WITH_ERST ++ rasdaemon_SOURCES += ras-erst.c ++endif + + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) +@@ -130,7 +133,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + 
non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ++ ras-poison-page-stat.h ras-erst.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 5fe1862..47e6346 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -260,6 +260,16 @@ AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pf + ]) + AM_CONDITIONAL([WITH_POISON_PAGE_STAT], [test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_memory_row_ce_pfa" = "xyes" || test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes" ]) + ++AC_ARG_ENABLE([erst], ++ AS_HELP_STRING([--enable-erst], [enable erst (currently experimental)])) ++ ++AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_ERST,1,"have ERST") ++ AC_SUBST([WITH_ERST]) ++]) ++AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -307,4 +317,5 @@ compile time options summary + YITIAN RAS errors : $USE_YITIAN_NS_DECODE + JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE + Signal : $USE_SIGNAL ++ ERST : $USE_ERST + EOF +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 3aa3a0d..193ee19 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -99,3 +99,5 @@ MC_CE_STAT_THRESHOLD=2000 + # Supported units: + # POISON_STAT_THRESHOLD: kB + POISON_STAT_THRESHOLD=102400 ++ ++ERST_DELETE=1 +diff --git a/ras-erst.c b/ras-erst.c +new file mode 100644 +index 0000000..c024d60 +--- /dev/null ++++ b/ras-erst.c +@@ -0,0 +1,195 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "ras-events.h" 
++#include "ras-erst.h" ++#include "ras-logger.h" ++#include "ras-mce-handler.h" ++#include "ras-record.h" ++#include "types.h" ++ ++struct mce { ++ uint64_t status; /* Bank's MCi_STATUS MSR */ ++ uint64_t misc; /* Bank's MCi_MISC MSR */ ++ uint64_t addr; /* Bank's MCi_ADDR MSR */ ++ uint64_t mcgstatus; /* Machine Check Global Status MSR */ ++ uint64_t ip; /* Instruction Pointer when the error happened */ ++ uint64_t tsc; /* CPU time stamp counter */ ++ uint64_t time; /* Wall time_t when error was detected */ ++ uint8_t cpuvendor; /* Kernel's X86_VENDOR enum */ ++ uint8_t inject_flags; /* Software inject flags */ ++ uint8_t severity; /* Error severity */ ++ uint8_t pad; ++ uint32_t cpuid; /* CPUID 1 EAX */ ++ uint8_t cs; /* Code segment */ ++ uint8_t bank; /* Machine check bank reporting the error */ ++ uint8_t cpu; /* CPU number; obsoleted by extcpu */ ++ uint8_t finished; /* Entry is valid */ ++ uint32_t extcpu; /* Linux CPU number that detected the error */ ++ uint32_t socketid; /* CPU socket ID */ ++ uint32_t apicid; /* CPU initial APIC ID */ ++ uint64_t mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ ++ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ ++ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ ++ uint64_t ppin; /* Protected Processor Inventory Number */ ++ uint32_t microcode; /* Microcode revision */ ++}; ++ ++static int erst_delete; /* non-zero: unlink each record after reading it (set from ERST_DELETE) */ ++ ++#define ERST_PATH "/sys/fs/pstore/erst" ++#define MCE_ERST_PREFIX "mce-erst" ++#define ERST_EVENT_NAME "mce_erst_record" ++ ++#ifdef HAVE_MCE ++static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e) ++{ ++ struct mce_priv *mce = ras->mce_priv; ++ struct trace_seq s; ++ int rc = 0; ++ ++ switch (mce->cputype) { ++ case CPU_GENERIC: ++ break; ++ case CPU_K8: ++ rc = parse_amd_k8_event(ras, e); ++ break; ++ case CPU_AMD_SMCA: ++ case CPU_DHYANA: ++ rc = parse_amd_smca_event(ras, e); ++ break; ++ default: /* All other CPU types are Intel */ ++ rc =
parse_intel_event(ras, e); ++ } ++ ++ if (rc) ++ return; ++ ++ mce_snprintf(e->error_msg, "%s", e->mcastatus_msg); ++ ++ trace_seq_init(&s); ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, ERST_EVENT_NAME); ++ ++ report_mce_event(ras, NULL, &s, e); ++ trace_seq_terminate(&s); ++ trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++} ++ ++static void handle_erst_mce_file(char *path, struct mce_event *e) /* copy one raw struct mce record from path into *e */ ++{ ++ FILE *file; ++ struct mce mce; ++ struct stat file_stat; ++ ++ file = fopen(path, "r"); ++ if (!file) { ++ log(ALL, LOG_ERR, "Failed to open file %s\n", path); ++ return; ++ } ++ ++ if (stat(path, &file_stat) < 0) { ++ log(ALL, LOG_ERR, "Failed to stat file %s\n", path); ++ goto out; ++ } ++ ++ if (fread((char *)&mce, 1, sizeof(mce), file) < sizeof(mce)) { ++ log(ALL, LOG_ERR, "Failed to read file %s\n", path); ++ goto out; ++ } ++ ++ e->mcgcap = mce.mcgcap; ++ e->mcgstatus = mce.mcgstatus; ++ ++ e->status = mce.status; ++ e->addr = mce.addr; ++ e->misc = mce.misc; ++ e->synd = mce.synd; ++ e->ipid = mce.ipid; ++ e->ip = mce.ip; ++ e->tsc = mce.tsc; ++ e->walltime = mce.time; ++ e->cpu = mce.extcpu; ++ e->cpuid = mce.cpuid; ++ e->apicid = mce.apicid; ++ e->socketid = mce.socketid; ++ e->cs = mce.cs; ++ e->bank = mce.bank; ++ e->cpuvendor = mce.cpuvendor; ++ e->ppin = mce.ppin; ++ e->microcode = mce.microcode; ++ ++ if (erst_delete) { ++ if (!unlink(path)) /* unlink() returns 0 on success */ ++ log(ALL, LOG_INFO, "Deleted file %s\n", path); ++ else ++ log(ALL, LOG_ERR, "Failed to delete file %s\n", path); ++ } ++ ++out: ++ fclose(file); ++} ++ ++void handle_erst_mce(void) /* not static: ras-erst.h declares this with external linkage */ ++{ ++ int rc; ++ struct ras_events ras = { 0 }; ++ struct dirent *entry; ++ DIR *dir; ++ ++ rc = init_mce_priv(&ras); ++ if (rc) { ++ log(ALL, LOG_INFO, "Can't register mce handler\n"); ++ return; ++ } ++ ++ dir = opendir(ERST_PATH); ++ if (!dir) { ++ log(ALL, LOG_INFO, "Failed to open directory\n"); ++ return; ++ } ++ ++ while
((entry = readdir(dir)) != NULL) { ++ struct stat path_stat; ++ char file_path[MAX_PATH]; ++ struct mce_event mce = { 0 }; ++ ++ mce.erst = 1; ++ if (strncmp(entry->d_name, MCE_ERST_PREFIX, strlen(MCE_ERST_PREFIX))) ++ continue; ++ ++ snprintf(file_path, sizeof(file_path), "%s/%s", ERST_PATH, entry->d_name); ++ if (stat(file_path, &path_stat) < 0) /* st_mode is undefined when stat() fails */ ++ continue; ++ if (S_ISREG(path_stat.st_mode)) { ++ handle_erst_mce_file(file_path, &mce); ++ } else { ++ log(TERM, LOG_ERR, "Unexpected file type\n"); ++ continue; ++ } ++ ++ ras_erst_mce_handler(&ras, &mce); ++ } ++ ++ closedir(dir); ++} ++#endif ++/* ERST just support mce now */ ++void handle_erst(void) ++{ ++ if (getenv(ERST_DELETE)) ++ erst_delete = atoi(getenv(ERST_DELETE)); ++ ++ handle_erst_mce(); ++} +diff --git a/ras-erst.h b/ras-erst.h +new file mode 100644 +index 0000000..83d7535 +--- /dev/null ++++ b/ras-erst.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#ifndef __RAS_ERST_H ++#define __RAS_ERST_H ++ ++#define ERST_DELETE "ERST_DELETE" ++ ++#ifdef HAVE_MCE ++void handle_erst_mce(void); ++#endif ++ ++void handle_erst(void); ++#endif +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 8713390..3d8d97d 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -228,7 +228,7 @@ ret: + return ret; + } + +-int register_mce_handler(struct ras_events *ras, unsigned int ncpus) ++int init_mce_priv(struct ras_events *ras) + { + int rc; + struct mce_priv *mce; +@@ -249,6 +249,11 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus) + ras->mce_priv = NULL; + return rc; + } ++ ++ return rc; ++} ++static void set_imc_log(struct mce_priv *mce, unsigned int ncpus) ++{ + switch (mce->cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: +@@ -259,6 +264,17 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus) + default: + break; + } ++} ++ ++int register_mce_handler(struct ras_events *ras, unsigned int ncpus)
++{ ++ int rc; ++ ++ rc = init_mce_priv(ras); ++ if (rc) ++ return rc; ++ ++ set_imc_log(ras->mce_priv, ncpus); + + return rc; + } +@@ -267,9 +283,8 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus) + * End of mcelog's code + */ + +-static void report_mce_event(struct ras_events *ras, +- struct tep_record *record, +- struct trace_seq *s, struct mce_event *e) ++void report_mce_event(struct ras_events *ras, struct tep_record *record, ++ struct trace_seq *s, struct mce_event *e) + { + time_t now; + struct tm *tm; +@@ -284,10 +299,14 @@ static void report_mce_event(struct ras_events *ras, + * not available (legacy kernels). + */ + +- if (ras->use_uptime) +- now = record->ts / user_hz + ras->uptime_diff; +- else +- now = time(NULL); ++ if (!e->erst) { ++ if (ras->use_uptime) ++ now = record->ts / user_hz + ras->uptime_diff; ++ else ++ now = time(NULL); ++ } else { ++ now = e->walltime; ++ } + + tm = localtime(&now); + if (tm) +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 57984ec..f120874 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -78,6 +78,7 @@ struct mce_event { + char mcastatus_msg[1024]; + char user_action[4096]; + char mc_location[256]; ++ int erst; + }; + + struct mce_priv { +@@ -108,6 +109,7 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus); + int ras_mce_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int init_mce_priv(struct ras_events *ras); + + /* enables intel iMC logs */ + int set_intel_imc_log(enum cputype cputype, unsigned int ncpus); +@@ -170,4 +172,6 @@ int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e); + + int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e); + ++void report_mce_event(struct ras_events *ras, struct tep_record *record, ++ struct trace_seq *s, struct mce_event *e); + #endif +diff --git a/ras-record.h b/ras-record.h +index 2dd6630..eb5b838 100644 +--- a/ras-record.h ++++ 
b/ras-record.h +@@ -28,6 +28,7 @@ struct ras_mc_event { + signed char top_layer, middle_layer, lower_layer; + unsigned long long address, grain, syndrome; + const char *driver_detail; ++ int erst; + }; + + struct ras_mc_offline_event { +@@ -46,6 +47,9 @@ struct ras_aer_event { + uint8_t tlp_header_valid; + uint32_t *tlp_header; + const char *msg; ++ int erst; ++ uint16_t vendor_id; ++ uint16_t device_id; + }; + + struct ras_extlog_event { +diff --git a/rasdaemon.c b/rasdaemon.c +index 6505dee..be5c390 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -10,6 +10,7 @@ + #include + #include + ++#include "ras-erst.h" + #include "ras-events.h" + #include "ras-logger.h" + #include "ras-poison-page-stat.h" +@@ -225,6 +226,16 @@ int main(int argc, char *argv[]) + if (daemon(0, 0)) + exit(EXIT_FAILURE); + ++#ifdef HAVE_ERST ++#ifdef HAVE_MCE ++ if (choices_disable && strlen(choices_disable) != 0 && ++ strstr(choices_disable, "ras:erst")) ++ log(ALL, LOG_INFO, "Disabled ras:erst from config\n"); ++ else ++ handle_erst(); ++#endif ++#endif ++ + handle_ras_events(args.record_events, args.enable_ipmitool); + + return 0; +-- +2.43.5 + diff --git a/1008-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch b/1008-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch deleted file mode 100644 index caee0f5803b859f69724ca93769c6091003de632..0000000000000000000000000000000000000000 --- a/1008-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 5925333ff040bab348e2c8e439ba05421c307958 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 5 Mar 2022 16:18:55 +0000 -Subject: [PATCH 08/85] rasdaemon: ras-mc-ctl: Reformat error info of the - HiSilicon Kunpeng920 - -Reformat the code to display the error info of HiSilicon Kunpeng920. 
- -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 15 +++++++++------ - 1 file changed, 9 insertions(+), 6 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index c4bef8f..00af3a7 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1671,8 +1671,9 @@ sub vendor_errors - $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); - $out .= "module_id=$module_id, " if ($module_id); - $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, \n" if ($err_severity); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -@@ -1694,8 +1695,9 @@ sub vendor_errors - $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); - $out .= "module_id=$module_id, " if ($module_id); - $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, \n" if ($err_severity); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -@@ -1719,8 +1721,9 @@ sub vendor_errors - $out .= "core_id=$core_id, " if ($core_id); - $out .= "port_id=$port_id, " if ($port_id); - $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "err_type=$err_type, \n" if ($err_type); -- $out .= "Error Registers: $regs\n\n" if ($regs); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; - } - if ($out ne "") { - print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; --- -2.33.1 - diff --git a/1009-aer-print-pci-device-name-and-vendor-device-id.patch 
b/1009-aer-print-pci-device-name-and-vendor-device-id.patch new file mode 100644 index 0000000000000000000000000000000000000000..ac5177eaf1b87638bb38f1f189dd066dcd024b01 --- /dev/null +++ b/1009-aer-print-pci-device-name-and-vendor-device-id.patch @@ -0,0 +1,166 @@ +From 5d8df52470036771ee97fa93ea0abcf3c3fbb3f3 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 27 Mar 2025 17:27:38 +0800 +Subject: [PATCH 09/30] aer: print pci device name and vendor/device id + +New aer log like follow: + + <...>-2682840 [125] .... 0.017661 aer_event 2025-03-27 +17:34:44 +0800 0000:99:00.0 (Intel Corporation Device 0b60 - +vendor_id: 0x8086 device_id: 0xb60) Data Link Protocol Uncorrected +(Non-Fatal) + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 4 ++-- + configure.ac | 8 ++++++++ + misc/rasdaemon.spec.in | 2 ++ + ras-aer-handler.c | 46 +++++++++++++++++++++++++++++++++++++++++- + ras-record.h | 2 +- + 5 files changed, 58 insertions(+), 4 deletions(-) + +diff --git a/Makefile.am b/Makefile.am +index e1bcda1..2911a21 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -123,8 +123,8 @@ if WITH_ERST + rasdaemon_SOURCES += ras-erst.c + endif + +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) +-rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) ++rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) + + include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ +diff --git a/configure.ac b/configure.ac +index 47e6346..3603c7f 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -54,6 +54,14 @@ AC_ARG_ENABLE([aer], + AS_IF([test "x$enable_aer" = "xyes" || test "x$enable_all" = "xyes"], [ + AC_DEFINE(HAVE_AER,1,"have PCIe AER events collect") + AC_SUBST([WITH_AER]) ++ ++ has_libpci_ver=0 ++ dnl check for pciutils library ++ PKG_CHECK_MODULES([LIBPCI], 
[libpci], [has_libpci_ver=1]) ++ ++ AS_IF([test "$has_libpci_ver" -eq 0], [ ++ AC_MSG_ERROR([libpci is required but were not found]) ++]) + ]) + AM_CONDITIONAL([WITH_AER], [test x$enable_aer = xyes || test x$enable_all = xyes]) + AM_COND_IF([WITH_AER], [USE_AER="yes"], [USE_AER="no"]) +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 4cc859f..a30045c 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -17,10 +17,12 @@ BuildRequires: perl-generators + BuildRequires: sqlite-devel + BuildRequires: systemd + BuildRequires: libtraceevent-devel ++BuildRequires: pciutils-devel + Provides: bundled(kernel-event-lib) + Requires: hwdata + Requires: perl-DBD-SQLite + Requires: libtraceevent ++Requires: pciutils-devel + %ifarch %{ix86} x86_64 + Requires: dmidecode + %endif +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 5d069f3..53acbc8 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -4,6 +4,7 @@ + * Copyright (C) 2013 Mauro Carvalho Chehab + */ + ++#include + #include + #include + #include +@@ -63,6 +64,45 @@ void ras_aer_handler_init(int enable_ipmitool) + + #define BUF_LEN 1024 + ++static void get_pci_dev_name(char *bdf, char *pci_name, ssize_t len, u16 *vendor_id, u16 *device_id) ++{ ++ struct pci_access *pacc; ++ struct pci_dev *dev; ++ struct pci_filter filter = {0}; ++ char *err; ++ ++ if (!pci_name) ++ return; ++ ++ pacc = pci_alloc(); ++ if (!pacc) ++ return; ++ ++ pci_init(pacc); ++ pci_scan_bus(pacc); ++ pci_filter_init(pacc, &filter); ++ err = pci_filter_parse_slot(&filter, bdf); ++ if (err) { ++ log(TERM, LOG_ERR, "Invalid PCI device name %s\n", bdf); ++ goto free; ++ } ++ ++ for (dev = pacc->devices; dev; dev = dev->next) { ++ if (pci_filter_match(&filter, dev)) { ++ pci_fill_info(dev, PCI_FILL_IDENT); ++ *vendor_id = dev->vendor_id; ++ *device_id = dev->device_id; ++ pci_lookup_name(pacc, pci_name, len, ++ PCI_LOOKUP_VENDOR | PCI_LOOKUP_DEVICE, ++ dev->vendor_id, dev->device_id); ++ break; ++ } ++ } ++ 
++free: ++ pci_cleanup(pacc); ++} ++ + int ras_aer_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +@@ -75,7 +115,8 @@ int ras_aer_event_handler(struct trace_seq *s, + time_t now; + struct tm *tm; + struct ras_aer_event ev; +- char buf[BUF_LEN]; ++ char buf[BUF_LEN] = { 0 }; ++ uint16_t vendor_id = 0, device_id = 0; + #ifdef HAVE_AMP_NS_DECODE + char ipmi_add_sel[105]; + uint8_t sel_data[5]; +@@ -108,6 +149,9 @@ int ras_aer_event_handler(struct trace_seq *s, + return -1; + trace_seq_printf(s, "%s ", ev.dev_name); + ++ get_pci_dev_name(ev.dev_name, buf, sizeof(buf), &vendor_id, &device_id); ++ trace_seq_printf(s, "(%s - vendor_id: %#x device_id: %#x) ", buf, vendor_id, device_id); ++ + if (tep_get_field_val(s, event, "status", record, &status_val, 1) < 0) + return -1; + +diff --git a/ras-record.h b/ras-record.h +index eb5b838..ce7d12c 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -43,7 +43,7 @@ struct ras_mc_offline_event { + struct ras_aer_event { + char timestamp[64]; + const char *error_type; +- const char *dev_name; ++ char *dev_name; + uint8_t tlp_header_valid; + uint32_t *tlp_header; + const char *msg; +-- +2.43.5 + diff --git a/1009-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch b/1009-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch deleted file mode 100644 index c4392b2ebdd58fb3b509c38eebe12adf261f2781..0000000000000000000000000000000000000000 --- a/1009-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch +++ /dev/null @@ -1,38 +0,0 @@ -From a35999326f4063d8cb0ed3813a3938acca09f41b Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 5 Mar 2022 17:01:35 +0000 -Subject: [PATCH 09/85] rasdaemon: ras-mc-ctl: Add printing usage if necessary - parameters are not passed for the vendor-error options - -Add printing usage if necessary parameters are not passed -for the vendor-errors options. 
- -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 00af3a7..0311e59 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1544,6 +1544,7 @@ sub vendor_errors_summary - if ($num_args ne 0) { - $platform_id = $ARGV[0]; - } else { -+ usage(1); - return; - } - -@@ -1651,6 +1652,7 @@ sub vendor_errors - if ($num_args ne 0) { - $platform_id = $ARGV[0]; - } else { -+ usage(1); - return; - } - --- -2.33.1 - diff --git a/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch b/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch new file mode 100644 index 0000000000000000000000000000000000000000..b5a6922e71e6063526fabf05ebbf639d9d7add4f --- /dev/null +++ b/1010-rasdaemon-introduce-EDPC-config-in-rasdaemon.patch @@ -0,0 +1,332 @@ +From 921765e3ccd8333c5474000e409dfb0ec80c8f32 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 27 Mar 2025 17:45:16 +0800 +Subject: [PATCH 10/30] rasdaemon: introduce EDPC config in rasdaemon + +System with EDPC enabled device can recovery from fatal aer error. +Rasdaemon now helps users correctly configure EDPC functionality. + +Rasdaemon will enable EDPC for fatal error if PCIE_EDPC_ENABLE set +to 1. All device with EDPC capability will be enabled by default +if EDPC_DEVICE is specified, only the specified device will be +enabled. For example: + PCIE_EDPC_ENABLE=1 + EDPC_DEVICE=0000:01:00.0 +only enable device 0000:01:00.0. 
+ +Signed-off-by: Ruidong Tian +--- + Makefile.am | 4 +- + misc/rasdaemon.env | 11 +++ + ras-pcie-edpc.c | 217 +++++++++++++++++++++++++++++++++++++++++++++ + ras-pcie-edpc.h | 9 ++ + rasdaemon.c | 5 ++ + 5 files changed, 244 insertions(+), 2 deletions(-) + create mode 100644 ras-pcie-edpc.c + create mode 100644 ras-pcie-edpc.h + +diff --git a/Makefile.am b/Makefile.am +index 2911a21..bb3d420 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -56,7 +56,7 @@ if WITH_SQLITE3 + rasdaemon_SOURCES += ras-record.c + endif + if WITH_AER +- rasdaemon_SOURCES += ras-aer-handler.c ++ rasdaemon_SOURCES += ras-aer-handler.c ras-pcie-edpc.c + endif + if WITH_NON_STANDARD + rasdaemon_SOURCES += ras-non-standard-handler.c +@@ -133,7 +133,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ras-erst.h ++ ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 193ee19..0516c9c 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -101,3 +101,14 @@ MC_CE_STAT_THRESHOLD=2000 + POISON_STAT_THRESHOLD=102400 + + ERST_DELETE=1 ++ ++# EDPC config ++# ++# rasdaemon will enable EDPC for fatal error if PCIE_EDPC_ENABLE set to 1 ++# All device with EDPC capability will be enabled by default, ++# if EDPC_DEVICE is specified, only the specified device will be enabled ++# For example: ++# PCIE_EDPC_ENABLE=1 ++# EDPC_DEVICE=0000:01:00.0 // only enable device 0000:01:00.0 ++PCIE_EDPC_ENABLE=0 ++EDPC_DEVICE= +diff --git a/ras-pcie-edpc.c b/ras-pcie-edpc.c +new file mode 100644 +index 0000000..4731b05 +--- /dev/null ++++ b/ras-pcie-edpc.c 
+@@ -0,0 +1,217 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ras-pcie-edpc.h" ++#include "ras-logger.h" ++#include "types.h" ++ ++#define EDPC_DEVICE "EDPC_DEVICE" ++ ++#define PCI_EXP_DPC_CTL_EN_MASK 0x3 ++ ++static char *edpc_str[] = { ++ [PCI_EXP_DPC_CTL_EN_FATAL] = "Fatal Error", ++ [PCI_EXP_DPC_CTL_EN_NONFATAL] = "Non-Fatal Error", ++}; ++ ++static bool is_cxl_mem_or_cache(struct pci_dev *dev) ++{ ++ struct pci_cap *cap; ++ u32 hdr; ++ u16 vendor, cxl_cap, id; ++ ++ cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DVSEC, PCI_CAP_EXTENDED); ++ if (!cap) ++ return false; ++ ++ hdr = pci_read_long(dev, cap->addr + PCI_DVSEC_HEADER1); ++ vendor = hdr & GENMASK(15, 0); ++ id = pci_read_word(dev, cap->addr + PCI_DVSEC_HEADER2); ++ if (vendor != PCI_DVSEC_VENDOR_ID_CXL || id != PCI_DVSEC_ID_CXL) ++ return false; ++ ++ cxl_cap = pci_read_word(dev, cap->addr + PCI_CXL_CAP); ++ if (cxl_cap & (PCI_CXL_CAP_CACHE | PCI_CXL_CAP_MEM)) ++ return true; ++ ++ return false; ++} ++ ++/** ++ * CXL 2.0 RAS spec: 4.2: ++ * Enabling eDPC is not recommended in most CXL 2.0 systems because eDPC ++ * containment flow brings the link down, disrupting CXL.cache and ++ * CXL.mem traffic which can lead to host timeouts. 
++ */ ++static void cxl_check_rp(struct pci_dev *dev, struct pci_dev *dpc) ++{ ++ struct pci_dev *dev_p, *dpc_p; ++ for (dev_p = dev->parent; dev_p; dev_p = dev_p->parent) { ++ for (dpc_p = dpc->next; dpc_p; dpc_p = dpc_p->next) { ++ if (dev_p->domain == dpc_p->domain && ++ dev_p->bus == dpc_p->bus && ++ dev_p->dev == dpc_p->dev && ++ dev_p->func == dpc_p->func) { ++ dpc_p->aux = (void *)true; ++ log(TERM, LOG_INFO, "Device %x:%x:%x.%x is CXL RP, ignore EDPC config\n", ++ dpc_p->domain, dpc_p->bus, dpc_p->dev, dpc_p->func); ++ } ++ } ++ } ++} ++ ++static bool has_edpc(struct pci_dev *dev) ++{ ++ struct pci_cap *cap; ++ ++ pci_fill_info(dev, PCI_FILL_EXT_CAPS); ++ cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DPC, PCI_CAP_EXTENDED); ++ if (!cap) ++ return false; ++ return true; ++} ++ ++static void set_edpc(struct pci_dev *dev) ++{ ++ struct pci_cap *cap; ++ u16 control; ++ int need_config = 0; ++ ++ cap = pci_find_cap(dev, PCI_EXT_CAP_ID_DPC, PCI_CAP_EXTENDED); ++ if (!cap) ++ return; ++ ++ control = pci_read_word(dev, cap->addr + PCI_EXP_DPC_CTL); ++ need_config = PCI_DPC_CTL_TRIGGER(control) == PCI_EXP_DPC_CTL_EN_FATAL ? 0 : 1; ++ log(TERM, LOG_INFO, "Device %x:%x:%x.%x origin EDPC %s and triggered for %s, %s need config\n", ++ dev->domain, dev->bus, dev->dev, dev->func, ++ (control & PCI_EXP_DPC_CTL_INT_EN) ? "enabled" : "disabled", ++ edpc_str[control & PCI_EXP_DPC_CTL_EN_MASK], ++ need_config ? "" : "not"); ++ ++ if (need_config) { ++ control &= PCI_EXP_DPC_CTL_EN_MASK; ++ control |= PCI_EXP_DPC_CTL_EN_FATAL; ++ pci_write_word(dev, cap->addr + PCI_EXP_DPC_CTL, control); ++ log(TERM, LOG_INFO, "Device %x:%x:%x.%x EDPC %s and triggered for %s\n", ++ dev->domain, dev->bus, dev->dev, dev->func, ++ (control & PCI_EXP_DPC_CTL_INT_EN) ? 
"enabled" : "disabled", ++ edpc_str[control & PCI_EXP_DPC_CTL_EN_MASK]); ++ } ++} ++ ++static struct pci_filter *config_pcie_edpc_device(struct pci_access *pacc, char *names, int *len) ++{ ++ int i; ++ struct pci_filter *filter = NULL; ++ char *token, *err, pci_names[MAX_PATH + 1]; ++ ++ strscpy(pci_names, names, sizeof(pci_names)); ++ for (i = 0; pci_names[i] != '\0'; i++) ++ if (pci_names[i] == ',') ++ (*len)++; ++ ++ filter = calloc(*len, sizeof(struct pci_filter)); ++ if (!filter) ++ return NULL; ++ ++ i = 0; ++ token = strtok(pci_names, ","); ++ while (token) { ++ pci_filter_init(pacc, &filter[i]); ++ err = pci_filter_parse_slot(&filter[i++], token); ++ if (err) { ++ free(filter); ++ log(TERM, LOG_ERR, "Invalid PCI device name %s\n", err); ++ return NULL; ++ } ++ token = strtok(NULL, ","); ++ } ++ ++ log(TERM, LOG_ERR, "Config PCIE EDPC for: %s\n", names); ++ ++ return filter; ++} ++ ++int config_pcie_edpc(void) ++{ ++ struct pci_access *pacc; ++ struct pci_dev *dev, *dev_head, *tmp; ++ int ret = 0, len = 1, i; ++ char *pci_names; ++ struct pci_filter *filter = NULL; ++ struct pci_dev dev_dpc_head = { 0 }; ++ ++ pacc = pci_alloc(); ++ if (!pacc) ++ return -1; ++ ++ pci_init(pacc); ++ pci_scan_bus(pacc); ++ ++ pci_names = getenv(EDPC_DEVICE); ++ if (pci_names && strlen(pci_names) != 0) { ++ filter = config_pcie_edpc_device(pacc, pci_names, &len); ++ if (!filter) ++ goto free; ++ } else { ++ len = 0; ++ } ++ ++ dev_head = pacc->devices; ++ for (dev = dev_head; dev; dev = dev->next) { ++ pci_fill_info(dev, PCI_FILL_PARENT); ++ if (has_edpc(dev)) { ++ tmp = malloc(sizeof(struct pci_dev)); ++ if (!tmp) { ++ ret = -1; ++ goto free; ++ } ++ ++ memcpy(tmp, dev, sizeof(struct pci_dev)); ++ tmp->next = dev_dpc_head.next; ++ dev_dpc_head.next = tmp; ++ } ++ } ++ ++ for (dev = dev_head; dev; dev = dev->next) ++ if (is_cxl_mem_or_cache(dev)) ++ cxl_check_rp(dev, &dev_dpc_head); ++ ++ for (dev = dev_dpc_head.next; dev; dev = dev->next) { ++ if (!dev->aux) { ++ if (len) { ++ 
for (i = 0; i < len; i++) { ++ if (pci_filter_match(&filter[i], dev)) { ++ set_edpc(dev); ++ break; ++ } ++ } ++ } else { ++ set_edpc(dev); ++ } ++ } ++ } ++ ++free: ++ while (dev_dpc_head.next) { ++ tmp = dev_dpc_head.next; ++ dev_dpc_head.next = tmp->next; ++ free(tmp); ++ } ++ ++ pci_cleanup(pacc); ++ free(filter); ++ return ret; ++} +diff --git a/ras-pcie-edpc.h b/ras-pcie-edpc.h +new file mode 100644 +index 0000000..a7b96a4 +--- /dev/null ++++ b/ras-pcie-edpc.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++ #define PCIE_EDPC_ENABLE "PCIE_EDPC_ENABLE" ++ ++int config_pcie_edpc(void); +diff --git a/rasdaemon.c b/rasdaemon.c +index be5c390..3d4c2ec 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -16,6 +16,7 @@ + #include "ras-poison-page-stat.h" + #include "ras-record.h" + #include "ras-mc-handler.h" ++#include "ras-pcie-edpc.h" + #include "types.h" + + /* +@@ -235,6 +236,10 @@ int main(int argc, char *argv[]) + handle_erst(); + #endif + #endif ++ if (getenv(PCIE_EDPC_ENABLE) && atoi(getenv(PCIE_EDPC_ENABLE))) ++ config_pcie_edpc(); ++ else ++ log(TERM, LOG_INFO, "PCIE EDPC config is not enabled\n"); + + handle_ras_events(args.record_events, args.enable_ipmitool); + +-- +2.43.5 + diff --git a/1010-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch b/1010-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch deleted file mode 100644 index 4a785dfda5d15a8cc4cc1195bc76b7a1ded1be54..0000000000000000000000000000000000000000 --- a/1010-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch +++ /dev/null @@ -1,275 +0,0 @@ -From c79b29bc63d32b8f80782a9860de31cb20b2c289 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 5 Mar 2022 18:19:38 +0000 -Subject: [PATCH 10/85] rasdaemon: ras-mc-ctl: Add support to display the - HiSilicon vendor errors for a specified module - -Add support to display the HiSilicon vendor errors for a specified module. 
- -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 145 +++++++++++++++++++++++++++------------------ - 1 file changed, 87 insertions(+), 58 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 0311e59..c23d93f 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -96,8 +96,9 @@ Usage: $prog [OPTIONS...] - --errors Shows the errors stored at the error database. - --error-count Shows the corrected and uncorrected error counts using sysfs. - --vendor-errors-summary Presents a summary of the vendor-specific logged errors. -- --vendor-errors Shows the vendor-specific errors stored in the error database. -- --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors. -+ --vendor-errors Shows the vendor-specific errors stored in the error database. -+ --vendor-errors Shows the vendor-specific errors for a specific module stored in the error database. -+ --vendor-platforms List the supported platforms with platform-ids for the vendor-specific errors. - --help This help message. 
- EOF - -@@ -1535,12 +1536,14 @@ use constant { - sub vendor_errors_summary - { - require DBI; -- my ($num_args, $platform_id); -+ my ($num_args, $platform_id, $found_platform); - my ($query, $query_handle, $count, $out); - my ($module_id, $sub_module_id, $err_severity, $err_sev); - - $num_args = $#ARGV + 1; - $platform_id = 0; -+ $found_platform = 0; -+ - if ($num_args ne 0) { - $platform_id = $ARGV[0]; - } else { -@@ -1552,6 +1555,7 @@ sub vendor_errors_summary - - # HiSilicon Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1615,6 +1619,7 @@ sub vendor_errors_summary - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -+ $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1636,21 +1641,31 @@ sub vendor_errors_summary - $query_handle->finish; - } - -+ if ($platform_id && !($found_platform)) { -+ print "Platform ID $platform_id is not valid\n"; -+ } -+ - undef($dbh); - } - - sub vendor_errors - { - require DBI; -- my ($num_args, $platform_id); -+ my ($num_args, $platform_id, $found_platform, $module, $found_module); - my ($query, $query_handle, $id, $timestamp, $out); - my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); - my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); - - $num_args = $#ARGV + 1; - $platform_id = 0; -+ $found_platform = 0; -+ $module = 0; -+ $found_module = 0; - if ($num_args ne 0) { - $platform_id = $ARGV[0]; -+ if ($num_args gt 1) { -+ $module = $ARGV[1]; -+ } - } else { - usage(1); - return; -@@ -1660,27 +1675,29 @@ sub vendor_errors - - # HiSilicon 
Kunpeng920 errors - if ($platform_id eq HISILICON_KUNPENG_920) { -+ $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs " if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { -+ $out .= "$id. 
$timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type1 errors.\n"; - } - $query_handle->finish; - -@@ -1690,21 +1707,22 @@ sub vendor_errors - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs " if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { -+ $out .= "$id. 
$timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type2 errors.\n"; - } - $query_handle->finish; - -@@ -1714,51 +1732,56 @@ sub vendor_errors - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. $timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "core_id=$core_id, " if ($core_id); -- $out .= "port_id=$port_id, " if ($port_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "err_type=$err_type, " if ($err_type); -- $out .= "Error Registers: $regs " if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) { -+ $out .= "$id. 
$timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "core_id=$core_id, " if ($core_id); -+ $out .= "port_id=$port_id, " if ($port_id); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "Error Registers: $regs " if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 PCIe controller errors.\n"; - } - $query_handle->finish; - } - - # HiSilicon Kunpeng9xx common errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { -+ $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); - $out = ""; - while($query_handle->fetch()) { -- $out .= "$id. 
$timestamp Error Info: "; -- $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "totem_id=$totem_id, " if ($totem_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "core_id=$core_id, " if ($core_id); -- $out .= "port_id=$port_id, " if ($port_id); -- $out .= "err_type=$err_type, " if ($err_type); -- $out .= "pcie_info=$pcie_info, " if ($pcie_info); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs" if ($regs); -- $out .= "\n\n"; -+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { -+ $out .= "$id. $timestamp Error Info: "; -+ $out .= "version=$version, "; -+ $out .= "soc_id=$soc_id, " if ($soc_id); -+ $out .= "socket_id=$socket_id, " if ($socket_id); -+ $out .= "totem_id=$totem_id, " if ($totem_id); -+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); -+ $out .= "module_id=$module_id, " if ($module_id); -+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -+ $out .= "core_id=$core_id, " if ($core_id); -+ $out .= "port_id=$port_id, " if ($port_id); -+ $out .= "err_type=$err_type, " if ($err_type); -+ $out .= "pcie_info=$pcie_info, " if ($pcie_info); -+ $out .= "err_severity=$err_severity, " if ($err_severity); -+ $out .= "Error Registers: $regs" if ($regs); -+ $out .= "\n\n"; -+ $found_module = 1; -+ } - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events:\n$out\n"; -@@ -1768,6 +1791,12 @@ sub vendor_errors - $query_handle->finish; - } - -+ if ($platform_id && !($found_platform)) { -+ print "Platform ID $platform_id is not valid\n"; -+ } elsif ($module && !($found_module)) { -+ print "No error record for the module 
$module\n"; -+ } -+ - undef($dbh); - } - --- -2.33.1 - diff --git a/1011-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch b/1011-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch deleted file mode 100644 index 0d06290dd907412d8a85a87187791999fd72cc53..0000000000000000000000000000000000000000 --- a/1011-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch +++ /dev/null @@ -1,151 +0,0 @@ -From 6a73fdf7beed1dafe4ea33018e047e36ce796815 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Mon, 7 Mar 2022 12:38:45 +0000 -Subject: [PATCH 11/85] rasdaemon: ras-mc-ctl: Relocate reading and display - Kunpeng920 errors to under Kunpeng9xx - -Relocate reading and display Kunpeng920 errors to under Kunpeng9xx. - -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 40 ++++++++++------------------------------ - 1 file changed, 10 insertions(+), 30 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index c23d93f..83ef9de 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1529,7 +1529,6 @@ sub errors - - # Definitions of the vendor platform IDs. 
- use constant { -- HISILICON_KUNPENG_920 => "Kunpeng920", - HISILICON_KUNPENG_9XX => "Kunpeng9xx", - }; - -@@ -1553,8 +1552,8 @@ sub vendor_errors_summary - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng920 errors -- if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx errors -+ if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); -@@ -1570,9 +1569,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n"; -+ print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1590,9 +1587,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n"; -+ print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1610,16 +1605,10 @@ sub vendor_errors_summary - $out .= "\t$sub_module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n"; -+ print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; - } - $query_handle->finish; -- } - -- # HiSilicon Kunpeng9xx common errors -- if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1635,8 +1624,6 @@ sub 
vendor_errors_summary - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng9xx common errors.\n\n"; - } - $query_handle->finish; - } -@@ -1673,8 +1660,8 @@ sub vendor_errors - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng920 errors -- if ($platform_id eq HISILICON_KUNPENG_920) { -+ # HiSilicon Kunpeng9xx errors -+ if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); -@@ -1697,7 +1684,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n"; -+ print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1722,7 +1709,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n"; -+ print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1749,14 +1736,10 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n"; -+ print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; - } - $query_handle->finish; -- } - -- # HiSilicon Kunpeng9xx common errors -- if ($platform_id eq HISILICON_KUNPENG_9XX) { -- $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1785,8 +1768,6 @@ sub vendor_errors - } - if ($out ne "") { - print "HiSilicon Kunpeng9xx common error 
events:\n$out\n"; -- } else { -- print "No HiSilicon Kunpeng9xx common errors.\n"; - } - $query_handle->finish; - } -@@ -1803,7 +1784,6 @@ sub vendor_errors - sub vendor_platforms - { - print "\nSupported platforms for the vendor-specific errors:\n"; -- print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n"; - print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; - print "\n"; - } --- -2.33.1 - diff --git a/1011-rasdaemon-support-nvgpu-event.patch b/1011-rasdaemon-support-nvgpu-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..b2ca511c696647fac28185033b4d7b564edf78f0 --- /dev/null +++ b/1011-rasdaemon-support-nvgpu-event.patch @@ -0,0 +1,511 @@ +From 0696914f490288081325b2a4425de1f0d45c4554 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 11 Apr 2025 13:30:10 +0800 +Subject: [PATCH 11/30] rasdaemon: support nvgpu event + +Use nvml library to report nvgpu event. New environment +NVGPU_DISABLE_EVENT indicate registered events. 
+ +Signed-off-by: Ruidong Tian +--- + Makefile.am | 13 +++- + configure.ac | 11 +++ + contrib/nvml.py | 77 ++++++++++++++++++++ + misc/rasdaemon.env | 7 ++ + ras-nvgpu-nvml.c | 178 +++++++++++++++++++++++++++++++++++++++++++++ + ras-nvgpu.c | 54 ++++++++++++++ + ras-nvgpu.h | 14 ++++ + rasdaemon.c | 27 +++++++ + 9 files changed, 380 insertions(+), 2 deletions(-) + create mode 100644 contrib/nvml.py + create mode 100644 ras-nvgpu-nvml.c + create mode 100644 ras-nvgpu.c + create mode 100644 ras-nvgpu.h + +diff --git a/Makefile.am b/Makefile.am +index bb3d420..58ac082 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -17,10 +17,12 @@ EXTRA_DIST = \ + $(RSYSLOG_SERVICES_IN) \ + $(LOGROTATE_SERVICES_IN) \ + misc/rasdaemon.env \ ++ contrib/nvml.py \ + contrib/mc_event_trigger \ + contrib/mem_fail_trigger + + CLEANFILES= \ ++ ras-nvgpu-nvml.h \ + misc/ras-mc-ctl.service \ + misc/rasdaemon.service \ + misc/rasdaemon.syslog-ng \ +@@ -123,7 +125,14 @@ if WITH_ERST + rasdaemon_SOURCES += ras-erst.c + endif + +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) ++if WITH_NVGPU ++ BUILT_SOURCES = ras-nvgpu-nvml.h ++ras-nvgpu-nvml.h: contrib/nvml.py ++ python3 $< > $@ ++ rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c ++endif ++ ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) + + include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ +@@ -133,7 +142,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ + ras-cxl-handler.h ras-cpu-isolation.h queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ++ ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h + + # This rule can't be called 
with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 3603c7f..43d845d 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -278,6 +278,16 @@ AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [ + AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) + ++AC_ARG_ENABLE([nvgpu], ++ AS_HELP_STRING([--enable-nvgpu], [enable NVGPU events])) ++ ++AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_NVGPU,1,"have NVGPU events collect") ++ AC_SUBST([WITH_NVGPU]) ++]) ++AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +@@ -326,4 +336,5 @@ compile time options summary + JAGUAR RAS errors : $USE_JAGUAR_NS_DECODE + Signal : $USE_SIGNAL + ERST : $USE_ERST ++ NVGPU RAS errors : $USE_NVGPU + EOF +diff --git a/contrib/nvml.py b/contrib/nvml.py +new file mode 100644 +index 0000000..9f2c57d +--- /dev/null ++++ b/contrib/nvml.py +@@ -0,0 +1,77 @@ ++import re ++ ++PATH="/usr/local/cuda/include/nvml.h" ++func = ["nvmlInit", ++ "nvmlDeviceGetSupportedEventTypes", ++ "nvmlDeviceRegisterEvents", ++ "nvmlEventSetCreate", ++ "nvmlEventSetWait", ++ "nvmlDeviceGetCount", ++ "nvmlDeviceGetHandleByIndex", ++ "nvmlDeviceGetPciInfo", ++ "nvmlEventSetFree", ++ "nvmlShutdown"] ++ ++pattern = re.compile( ++ r'^nvmlReturn_t DECLDIR\s+({})(\(.*?\));'.format('|'.join(map(re.escape, func))), ++ flags=re.MULTILINE ++) ++ ++type_pattern = re.compile( ++ r'^#define\s+nvmlEventType(\w+)\s+0x.*', ++ flags=re.MULTILINE ++) ++ ++with open(PATH, 'r') as file: ++ content = file.read() ++ matched_lines = pattern.findall(content) ++ type_lines = 
type_pattern.findall(content) ++ ++func_declares = [] ++func_defs = [] ++func_inits = [] ++type_strs = [] ++ ++for match in matched_lines: ++ func_declares.append('typedef nvmlReturn_t (*my_{}_p){};'.format(match[0], match[1])) ++ func_defs.append('my_{}_p my_{};'.format(match[0], match[0])) ++ func_inits.append('my_{0} = (my_{0}_p)dlsym(handle, "{0}"); \ ++ \n\tif (!my_{0}) {{ \ ++ \n\t\tprintf(\"Failed to load {0}: %s\\n\", dlerror()); \ ++ \n\t\treturn -1; \ ++ \n\t}}'.format(match[0])) ++ ++for type_line in type_lines: ++ type_strs.append('case nvmlEventType{}: return \"{}\";'.format(type_line, type_line)) ++ ++print(''' ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++''' ++) ++print('#include \ ++ \n#include \ ++ \n#include "/usr/local/cuda/include/nvml.h"') ++print('\ntypedef const char* (*my_nvmlErrorString_p)(nvmlReturn_t result);') ++print('\n'.join(func_declares)) ++print('\nmy_nvmlErrorString_p my_nvmlErrorString;') ++print('\n'.join(func_defs)) ++print('\nstatic int my_nvml_setup(void* handle) \n{{\n\t{}{}\n\treturn 0;\n}}'.format('\n\t'.join(func_inits), ++ '\n\tmy_nvmlErrorString = (my_nvmlErrorString_p)dlsym(handle, "nvmlErrorString"); \ ++ \n\tif (!my_nvmlErrorString) { \ ++ \n\t\tprintf(\"Failed to load nvmlErrorString: %s\\n\", dlerror()); \ ++ \n\t\treturn -1; \ ++ \n\t}')) ++print('\nstatic const char* my_nvmlEventTypeString(unsigned long long type) \n{{ \ ++ \n\n\tswitch (type) {{ \ ++ \n\t{} \ ++ \n\tdefault: return \"Unknown\"; \ ++ \n\t}} \ ++ \n\treturn \"Unknown\"; \ ++ \n}}'.format('\n\t'.join(type_strs))) ++ ++ +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 0516c9c..60544f7 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -112,3 +112,10 @@ ERST_DELETE=1 + # EDPC_DEVICE=0000:01:00.0 // only enable device 0000:01:00.0 + PCIE_EDPC_ENABLE=0 + EDPC_DEVICE= ++ ++# Registered event type for nvgpu, default is ++# nvmlEventTypeAll & ~nvmlEventTypeClock ++# ref: 
https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html ++# For example: ++# NVGPU_DISABLE_EVENT="0x10" # disable nvmlEventTypeClock ++NVGPU_DISABLE_EVENT="0x10" +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +new file mode 100644 +index 0000000..aabe8f9 +--- /dev/null ++++ b/ras-nvgpu-nvml.c +@@ -0,0 +1,178 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++ ++#include "ras-logger.h" ++#include "ras-nvgpu-nvml.h" ++#include "ras-nvgpu.h" ++#include "trace-seq.h" ++#include "types.h" ++ ++#define XID_EVENT_NAME "xid" ++ ++const char *lib_name[] = { ++ "/lib64/libnvidia-ml.so", ++ "/lib64/libnvidia-ml.so.1", ++ "/usr/local/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so", ++ "/usr/local/cuda/targets/sbsa-linux/lib/stubs/libnvidia-ml.so" ++}; ++ ++static void *find_lib(void) ++{ ++ void *handle = NULL; ++ ++ for (int i = 0; i < ARRAY_SIZE(lib_name); i++) { ++ handle = dlopen(lib_name[i], RTLD_LAZY); ++ if (handle) ++ return handle; ++ } ++ ++ log(ALL, LOG_ERR, "Failed to load libnvidia-ml\n"); ++ return NULL; ++} ++ ++static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) ++{ ++ struct trace_seq s; ++ nvmlPciInfo_t pci; ++ time_t now; ++ struct tm *tm; ++ char timestamp[64]; ++ ++ time(&now); ++ tm = localtime(&now); ++ ++ if (tm) ++ strftime(timestamp, sizeof(timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ my_nvmlDeviceGetPciInfo(data->device, &pci); ++ ++ trace_seq_init(&s); ++ if (data->eventType == nvmlEventTypeXidCriticalError) { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME); ++ trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "xid: %lld ", data->eventData); ++ } else { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME); ++ trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "event_type: 
%s(%llx) ", my_nvmlEventTypeString(data->eventType), data->eventType); ++ trace_seq_printf(&s, "data: %lld ", data->eventData); ++ } ++ ++ trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId); ++ trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId); ++ ++ trace_seq_terminate(&s); ++ trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++ ++ return 0; ++} ++ ++int ras_nvgpu_nvml_handle(void) ++{ ++ void *nvml_handle; ++ nvmlReturn_t ret; ++ unsigned int device_count; ++ nvmlDevice_t *devices; ++ nvmlEventSet_t event_set; ++ char *event_types_str = NULL; ++ unsigned long long disable = 0, event_types = 0; ++ nvmlEventData_t event_data; ++ ++ nvml_handle = find_lib(); ++ if (!nvml_handle) { ++ log(ALL, LOG_ERR, "Failed to load libnvidia-ml: %s\n", dlerror()); ++ return 1; ++ } ++ ++ if (my_nvml_setup(nvml_handle)) { ++ log(ALL, LOG_ERR, "Failed to setup libnvidia-ml\n"); ++ dlclose(nvml_handle); ++ return 1; ++ } ++ ++ ret = my_nvmlInit(); ++ if (ret) { ++ log(ALL, LOG_ERR, "NVML Init failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_dl; ++ } ++ ++ ret = my_nvmlDeviceGetCount(&device_count); ++ if (ret) { ++ log(ALL, LOG_ERR, "Get device count failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_nvml; ++ } ++ ++ devices = malloc(device_count * sizeof(nvmlDevice_t)); ++ if (!devices) { ++ log(ALL, LOG_ERR, "Failed to allocate memory for devices\n"); ++ goto free_nvml; ++ } ++ ++ for (unsigned int i = 0; i < device_count; i++) { ++ ret = my_nvmlDeviceGetHandleByIndex(i, &devices[i]); ++ if (ret) { ++ log(ALL, LOG_ERR, "Get device handle failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_dev; ++ } ++ } ++ ++ ret = my_nvmlEventSetCreate(&event_set); ++ if (ret) { ++ log(ALL, LOG_ERR, "Create event set failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_dev; ++ } ++ ++ event_types_str = 
getenv("NVGPU_DISABLE_EVENT"); ++ if (event_types_str) { ++ disable = strtoull(event_types_str, NULL, 0); ++ log(ALL, LOG_INFO, "Disable NVGPU events %s\n", my_nvmlEventTypeString(disable)); ++ } ++ ++ for (unsigned int i = 0; i < device_count; i++) { ++ ret = my_nvmlDeviceGetSupportedEventTypes(devices[i], &event_types); ++ if (ret) { ++ log(ALL, LOG_ERR, "Get support events failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_event; ++ } ++ ++ ret = my_nvmlDeviceRegisterEvents(devices[i], event_types & ~disable, event_set); ++ if (ret) { ++ log(ALL, LOG_ERR, "Register events failed: %s\n", my_nvmlErrorString(ret)); ++ goto free_event; ++ } ++ } ++ ++ while (1) { ++ ret = my_nvmlEventSetWait(event_set, &event_data, -1); ++ if (!ret) ++ report_ras_gpu_nvml(&event_data, devices); ++ else { ++ log(ALL, LOG_ERR, "Wait for event failed: %s\n", my_nvmlErrorString(ret)); ++ break; ++ } ++ } ++ ++free_event: ++ my_nvmlEventSetFree(event_set); ++free_dev: ++ free(devices); ++free_nvml: ++ my_nvmlShutdown(); ++free_dl: ++ dlclose(nvml_handle); ++ return ret; ++} +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +new file mode 100644 +index 0000000..5c63279 +--- /dev/null ++++ b/ras-nvgpu.c +@@ -0,0 +1,54 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ras-events.h" ++#include "ras-logger.h" ++#include "ras-nvgpu.h" ++void *ras_nvgpu_handle(void *arg) ++{ ++ (void)arg; ++ sigset_t set; ++ struct stat st; ++ int retry = 3; ++ ++ if (stat("/dev/nvidia0", &st) == -1) { ++ log(ALL, LOG_WARNING, "NVIDIA device not found: %s\n", strerror(errno)); ++ return NULL; ++ } ++ if (!S_ISCHR(st.st_mode)) { ++ log(ALL, LOG_WARNING, "NVIDIA device is not a character device\n"); ++ return NULL; ++ } ++ ++ sigemptyset(&set); ++ sigaddset(&set, SIGINT); ++ sigaddset(&set, SIGTERM); ++ sigaddset(&set, SIGHUP); ++ sigaddset(&set, SIGQUIT); ++ if 
(pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) { ++ log(ALL, LOG_ERR, "Failed to set thread signal mask\n"); ++ return NULL; ++ } ++ ++ while (retry--) { ++ if (ras_nvgpu_nvml_handle()) { ++ log(ALL, LOG_ERR, "NVGPU handle retry %d\n", retry); ++ sleep(10); ++ } ++ } ++ ++ log(ALL, LOG_ERR, "NVGPU handle fail, exit from nvgpu thread\n"); ++ ++ return NULL; ++} +diff --git a/ras-nvgpu.h b/ras-nvgpu.h +new file mode 100644 +index 0000000..32827ad +--- /dev/null ++++ b/ras-nvgpu.h +@@ -0,0 +1,14 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#ifndef __RAS_NVGPU_H ++#define __RAS_NVGPU_H ++ ++#define NVGPU_EVENT_NAME "nvgpu" ++ ++void *ras_nvgpu_handle(void *arg); ++int ras_nvgpu_nvml_handle(void); ++#endif +diff --git a/rasdaemon.c b/rasdaemon.c +index 3d4c2ec..9c5f9dd 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -5,6 +5,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -17,6 +18,7 @@ + #include "ras-record.h" + #include "ras-mc-handler.h" + #include "ras-pcie-edpc.h" ++#include "ras-nvgpu.h" + #include "types.h" + + /* +@@ -241,7 +243,32 @@ int main(int argc, char *argv[]) + else + log(TERM, LOG_INFO, "PCIE EDPC config is not enabled\n"); + ++#ifdef HAVE_NVGPU ++ pthread_t nvgpu_thread = 0, main_thread = pthread_self(); ++ bool nvgpu_enable = true; ++ ++ if (choices_disable && strlen(choices_disable) != 0 && ++ strstr(choices_disable, NVGPU_EVENT_NAME)) { ++ nvgpu_enable = false; ++ log(ALL, LOG_INFO, "Disable nvgpu event.\n"); ++ } ++ ++ if (nvgpu_enable) { ++ if (pthread_create(&nvgpu_thread, NULL, ras_nvgpu_handle, &main_thread) != 0) { ++ log(ALL, LOG_ERR, "Failed to create XID thread\n"); ++ pthread_cancel(nvgpu_thread); ++ exit(EXIT_FAILURE); ++ } ++ pthread_detach(nvgpu_thread); ++ log(ALL, LOG_INFO, "Create pthread to handle NVGPU events.\n"); ++ } ++#endif + handle_ras_events(args.record_events, args.enable_ipmitool); + ++#ifdef HAVE_NVGPU ++ if (nvgpu_enable) ++ 
pthread_cancel(nvgpu_thread); ++#endif ++ + return 0; + } +-- +2.43.5 + diff --git a/1075-rasdaemon-enhance-rasdaemon-event-trigger.patch b/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch similarity index 82% rename from 1075-rasdaemon-enhance-rasdaemon-event-trigger.patch rename to 1012-rasdaemon-enhance-rasdaemon-event-trigger.patch index 76271e0d59410e1b315df5bbd14d7ab22651919e..4292bb075257cf6d4c26d11b711444c40a51635d 100644 --- a/1075-rasdaemon-enhance-rasdaemon-event-trigger.patch +++ b/1012-rasdaemon-enhance-rasdaemon-event-trigger.patch @@ -1,7 +1,7 @@ -From faaca73f3cb70ab4baba9d00eefe4d9cde14e033 Mon Sep 17 00:00:00 2001 +From 9163f3cd0f9344aacf8eb4b061f3ea2269f6c0cb Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Fri, 7 Jun 2024 11:26:06 +0800 -Subject: [PATCH 75/85] rasdaemon: enhance rasdaemon event trigger +Subject: [PATCH 12/30] rasdaemon: enhance rasdaemon event trigger - Add trigger timeout to avoid trigger hang. - Move all trigger code to trigger.c @@ -14,44 +14,61 @@ delete timeout. 
Signed-off-by: Ruidong Tian --- - Makefile.am | 2 +- - contrib/aer_trigger | 18 ++ - contrib/mce_record_trigger | 36 ++++ - contrib/mem_fail_trigger | 16 ++ - contrib/mem_fail_trigger.sh | 12 -- - misc/rasdaemon.env | 26 +++ + Makefile.am | 6 +- + contrib/aer_trigger | 27 +++ + contrib/mc_event_trigger | 9 + + contrib/mce_record_trigger | 46 +++++ + contrib/mem_fail_trigger | 21 +- + misc/rasdaemon.env | 23 ++- ras-aer-handler.c | 3 + - ras-events.c | 16 -- - ras-mc-handler.c | 90 +------- - ras-mce-handler.c | 8 +- + ras-events.c | 18 -- + ras-mc-handler.c | 89 +-------- + ras-mce-handler.c | 3 + ras-memory-failure-handler.c | 55 +---- - trigger.c | 387 ++++++++++++++++++++++++++++++++--- + trigger.c | 376 ++++++++++++++++++++++++++++++++--- trigger.h | 19 +- - 13 files changed, 484 insertions(+), 204 deletions(-) + 13 files changed, 493 insertions(+), 202 deletions(-) create mode 100755 contrib/aer_trigger create mode 100755 contrib/mce_record_trigger - create mode 100755 contrib/mem_fail_trigger - delete mode 100755 contrib/mem_fail_trigger.sh diff --git a/Makefile.am b/Makefile.am -index 91aab1e..fb0248e 100644 +index 58ac082..72f30b4 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -106,4 +106,4 @@ install-data-local: +@@ -18,8 +18,7 @@ EXTRA_DIST = \ + $(LOGROTATE_SERVICES_IN) \ + misc/rasdaemon.env \ + contrib/nvml.py \ +- contrib/mc_event_trigger \ +- contrib/mem_fail_trigger ++ contrib/*_trigger + + CLEANFILES= \ + ras-nvgpu-nvml.h \ +@@ -171,8 +170,6 @@ install-data-local: $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" - $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" - $(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger" +- $(install_sh) @abs_srcdir@/contrib/mem_fail_trigger 
"$(DESTDIR)@sysconfdir@/ras/triggers/mem_fail_trigger" + if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.conf"; \ + fi +@@ -182,3 +179,4 @@ install-data-local: + if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ + fi + $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" diff --git a/contrib/aer_trigger b/contrib/aer_trigger new file mode 100755 -index 0000000..982ff01 +index 0000000..87f9da9 --- /dev/null +++ b/contrib/aer_trigger -@@ -0,0 +1,18 @@ +@@ -0,0 +1,27 @@ +#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 +# This shell script can be executed by rasdaemon in daemon mode when a -+# memory_failure_event is occured, environment variables include all ++# memory_failure_event is occurred, environment variables include all +# information reported by tracepoint. + +# environment: @@ -65,17 +82,44 @@ index 0000000..982ff01 + +[ -x ./aer_trigger.local ] && . ./aer_trigger.local + ++if [ -d aer_trigger.extern ] ++then ++ ls aer_trigger.extern | ++ while read item ++ do ++ [ -x ./aer_trigger.extern/$item ] && . ./aer_trigger.extern/$item ++ done ++fi + +exit 0 +diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger +index 9862595..5c68b56 100755 +--- a/contrib/mc_event_trigger ++++ b/contrib/mc_event_trigger +@@ -23,4 +23,13 @@ + + [ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local + ++if [ -d mc_event_trigger.extern ] ++then ++ ls mc_event_trigger.extern | ++ while read item ++ do ++ [ -x ./mc_event_trigger.extern/$item ] && . 
./mc_event_trigger.extern/$item ++ done ++fi ++ + exit 0 diff --git a/contrib/mce_record_trigger b/contrib/mce_record_trigger new file mode 100755 -index 0000000..06a52d9 +index 0000000..ca49e6d --- /dev/null +++ b/contrib/mce_record_trigger -@@ -0,0 +1,36 @@ +@@ -0,0 +1,46 @@ +#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 +# This shell script can be executed by rasdaemon in daemon mode when a -+# mc_event is occured, environment variables include all information ++# mc_event is occurred, environment variables include all information +# reported by tracepoint. +# +# environment: @@ -108,73 +152,73 @@ index 0000000..06a52d9 + +[ -x ./mce_record_trigger.local ] && . ./mce_record_trigger.local + ++if [ -d mce_record_trigger.extern ] ++then ++ ls mce_record_trigger.extern | ++ while read item ++ do ++ [ -x ./mce_record_trigger.extern/$item ] && . ./mce_record_trigger.extern/$item ++ done ++fi ++ +exit 0 diff --git a/contrib/mem_fail_trigger b/contrib/mem_fail_trigger -new file mode 100755 -index 0000000..ee44227 ---- /dev/null +index d75ce50..f63df91 100755 +--- a/contrib/mem_fail_trigger +++ b/contrib/mem_fail_trigger -@@ -0,0 +1,16 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon in daemon mode when a -+# memory_failure_event is occured, environment variables include all -+# information reported by tracepoint. +@@ -1,14 +1,25 @@ + #!/bin/sh + # SPDX-License-Identifier: GPL-2.0 +-# + # This shell script can be executed by rasdaemon in daemon mode when a + # memory_failure_event is occured, environment variables include all + # information reported by tracepoint. + +# environment: +# TIMESTAMP Timestamp when error occurred +# PFN Offlined page PFN +# PAGE_TYPE Page type +# ACTION_RESULT Action result -+# -+ -+[ -x ./mf_trigger.local ] && . 
./mf_trigger.local -+ -+ -+exit 0 -diff --git a/contrib/mem_fail_trigger.sh b/contrib/mem_fail_trigger.sh -deleted file mode 100755 -index a3ac362..0000000 ---- a/contrib/mem_fail_trigger.sh -+++ /dev/null -@@ -1,12 +0,0 @@ --#!/bin/sh --# This shell script can be executed by rasdaemon in daemon mode when a --# memory_failure_event is occured, environment variables include all --# information reported by tracepoint. --# -- + # + -echo TIMESTAMP: $TIMESTAMP -echo PFN: $PFN -echo PAGE_TYPE: $PAGE_TYPE -echo ACTION_RESULT: $ACTION_RESULT -- --exit 0 ++[ -x ./mf_trigger.local ] && . ./mf_trigger.local ++ ++if [ -d mf_trigger.extern ] ++then ++ ls mf_trigger.extern | ++ while read item ++ do ++ [ -x ./mf_trigger.extern/$item ] && . ./mf_trigger.extern/$item ++ done ++fi + + exit 0 diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 3389a73..9293038 100644 +index 60544f7..1f5da55 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -55,8 +55,34 @@ TRIGGER_DIR= +@@ -83,11 +83,30 @@ TRIGGER_DIR= # Execute these triggers when the mc_event occured, the triggers will not # be executed if the trigger is not specified. +# You can set timeout for trigger, trigger thread will be killed if timeout. +# The default timeout is 1, if you do not want any timeout, set it to 0. 
# For example: - # MC_CE_TRIGGER=mc_event_trigger +-# MC_CE_TRIGGER=mc_event_trigger # MC_UE_TRIGGER=mc_event_trigger -+# MC_CE_TRIGGER_TIMEOUT=1 +-MC_CE_TRIGGER= +# MC_UE_TRIGGER_TIMEOUT=1 + +# trigger for mc_event - MC_CE_TRIGGER= MC_UE_TRIGGER= -+MC_CE_TRIGGER_TIMEOUT=0 +MC_UE_TRIGGER_TIMEOUT=0 + -+MCE_CE_TRIGGER= +MCE_DE_TRIGGER= +MCE_UE_TRIGGER= -+MCE_CE_TRIGGER_TIMEOUT=0 +MCE_DE_TRIGGER_TIMEOUT=0 +MCE_UE_TRIGGER_TIMEOUT=0 + @@ -187,21 +231,23 @@ index 3389a73..9293038 100644 +AER_CE_TRIGGER_TIMEOUT=0 +AER_UE_TRIGGER_TIMEOUT=0 +AER_FATAL_TRIGGER_TIMEOUT=0 -+ + + # CE Statistic Threshold + # diff --git a/ras-aer-handler.c b/ras-aer-handler.c -index 40c60bb..b00703e 100644 +index 53acbc8..471ad9f 100644 --- a/ras-aer-handler.c +++ b/ras-aer-handler.c -@@ -25,6 +25,7 @@ - #include "ras-logger.h" - #include "bitfield.h" +@@ -17,6 +17,7 @@ #include "ras-report.h" + #include "unified-sel.h" + #include "types.h" +#include "trigger.h" /* bit field meaning for correctable error */ static const char *aer_cor_errors[32] = { -@@ -183,5 +184,7 @@ int ras_aer_event_handler(struct trace_seq *s, - system(ipmi_add_sel); +@@ -254,5 +255,7 @@ int ras_aer_event_handler(struct trace_seq *s, + return -1; #endif + run_aer_event_trigger(&ev); @@ -209,22 +255,24 @@ index 40c60bb..b00703e 100644 return 0; } diff --git a/ras-events.c b/ras-events.c -index 2411cf7..f944847 100644 +index d42ed9f..06f9a37 100644 --- a/ras-events.c +++ b/ras-events.c -@@ -64,11 +64,6 @@ +@@ -54,13 +54,6 @@ - extern char* choices_disable; + char *choices_disable; -static const struct event_trigger event_triggers[] = { - { "mc_event", &mc_event_trigger_setup }, +-#ifdef HAVE_MEMORY_FAILURE - { "memory_failure_event", &mem_fail_event_trigger_setup }, +-#endif -}; - static int get_debugfs_dir(char *tracing_dir, size_t len) { FILE *fp; -@@ -315,17 +310,6 @@ free_ras: +@@ -328,17 +321,6 @@ free_ras: return 0; } @@ -239,24 +287,16 @@ index 2411cf7..f944847 100644 - } -} - + #ifdef HAVE_DISKERROR + #if 
(!defined(HAVE_BLK_RQ_ERROR)) || defined(HAVE_SIGNAL) /* - * Set kernel filter. libtrace doesn't provide an API for setting filters - * in kernel, we have to implement it here. diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index c438771..a270637 100644 +index 7a18f73..a729d93 100644 --- a/ras-mc-handler.c +++ b/ras-mc-handler.c -@@ -23,6 +23,7 @@ - #include - #include - #include "libtrace/kbuffer.h" -+#include - #include "ras-mc-handler.h" - #include "ras-logger.h" - #include "ras-page-isolation.h" -@@ -30,89 +31,6 @@ - #include "ras-report.h" +@@ -20,89 +20,6 @@ #include "trigger.h" + #include "types.h" -#define MAX_ENV 30 -static const char *mc_ce_trigger = NULL; @@ -341,10 +381,10 @@ index c438771..a270637 100644 - free(env[i]); -} - - int ras_mc_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context) -@@ -282,11 +200,7 @@ int ras_mc_event_handler(struct trace_seq *s, + static unsigned long long per_sec_ce_count; + unsigned long long mc_ce_stat_threshold; + static time_t cur; +@@ -312,11 +229,7 @@ int ras_mc_event_handler(struct trace_seq *s, ras_report_mc_event(ras, &ev); #endif @@ -358,35 +398,18 @@ index c438771..a270637 100644 return 0; diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index f7ab23e..9601704 100644 +index 3d8d97d..92c5339 100644 --- a/ras-mce-handler.c +++ b/ras-mce-handler.c -@@ -14,8 +14,8 @@ - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -- */ -- -+*/ -+#define _GNU_SOURCE - #include - #include - #include -@@ -23,11 +23,13 @@ - #include - #include - #include -+#include - #include "libtrace/kbuffer.h" +@@ -17,6 +17,7 @@ #include "ras-mce-handler.h" - #include "types.h" - #include "ras-logger.h" #include "ras-report.h" + #include "types.h" +#include "trigger.h" /* - * The code below were adapted from Andi 
Kleen/Intel/SuSe mcelog code, -@@ -579,5 +581,7 @@ int ras_mce_event_handler(struct trace_seq *s, + * The code below were adapted from Andi Kleen/Intel/SUSE mcelog code, +@@ -598,5 +599,7 @@ int ras_mce_event_handler(struct trace_seq *s, ras_report_mce_event(ras, &e); #endif @@ -395,10 +418,10 @@ index f7ab23e..9601704 100644 return 0; } diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index df427b1..8bc7a9d 100644 +index d4c293b..0f4e937 100644 --- a/ras-memory-failure-handler.c +++ b/ras-memory-failure-handler.c -@@ -94,59 +94,6 @@ static const struct { +@@ -87,59 +87,6 @@ static const struct { { MF_RECOVERED, "Recovered" }, }; @@ -458,7 +481,7 @@ index df427b1..8bc7a9d 100644 static const char *get_page_type(int page_type) { unsigned int i; -@@ -225,7 +172,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, +@@ -222,7 +169,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, /* Report event to ABRT */ ras_report_mf_event(ras, &ev); #endif @@ -468,21 +491,22 @@ index df427b1..8bc7a9d 100644 return 0; } diff --git a/trigger.c b/trigger.c -index 3031f4b..334d945 100644 +index aa19a22..a13fffd 100644 --- a/trigger.c +++ b/trigger.c -@@ -2,54 +2,387 @@ +@@ -3,56 +3,378 @@ + #define _GNU_SOURCE #include - #include #include +#include #include + #include + #include "ras-logger.h" +#include "types.h" #include "trigger.h" -void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter) -+#include "ras-event.c" +#include "ras-mce-handler.h" + +#define MAX_ENV 30 @@ -517,11 +541,17 @@ index 3031f4b..334d945 100644 + log(ALL, LOG_ERR, "Cannot create process for trigger\n"); return; + } else if (child == 0) { -+ if (execve(path, argv, env) == -1) { ++ if (execve(path, argv, env) == -1) + log(ALL, LOG_ERR, "Trigger %s exec fail: %s\n", path, strerror(errno)); -+ } + _exit(EXIT_FAILURE); ++ } ++ ++ signal(SIGCHLD, child_handler); ++ if (timeout) { ++ signal(SIGALRM, alarm_handler); ++ alarm(timeout); } ++ 
pause(); - if (child == 0) { - execve(trigger, argv, env); @@ -534,13 +564,6 @@ index 3031f4b..334d945 100644 - } else if (WIFSIGNALED(status)) { - log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d", - trigger, WTERMSIG(status)); -+ signal(SIGCHLD, child_handler); -+ if (timeout) { -+ signal(SIGALRM, alarm_handler); -+ alarm(timeout); -+ } -+ pause(); -+ + if (child_done) { + if (waitpid(child, &status, WNOHANG) == child) { + if (WIFEXITED(status) && WEXITSTATUS(status)) @@ -564,7 +587,10 @@ index 3031f4b..334d945 100644 -const char *trigger_check(const char *s) +int trigger_check(struct event_trigger *t) -+{ + { +- char *name; +- int rc; +- char *trigger_dir = getenv("TRIGGER_DIR"); + if (trigger_dir) + if (snprintf(t->abs_path, 256, "%s/%s", trigger_dir, t->path) < 0) + return -1; @@ -572,13 +598,15 @@ index 3031f4b..334d945 100644 + return access(t->abs_path, R_OK | X_OK); +} + -+struct event_trigger mc_ce_trigger = {"mc_event", "MC_CE_TRIGGER"}; +struct event_trigger mc_ue_trigger = {"mc_event", "MC_UE_TRIGGER"}; + -+struct event_trigger mce_ce_trigger = {"mce_record", "MCE_CE_TRIGGER"}; +struct event_trigger mce_de_trigger = {"mce_record", "MCE_DE_TRIGGER"}; +struct event_trigger mce_ue_trigger = {"mce_record", "MCE_UE_TRIGGER"}; -+ + +- if (trigger_dir) { +- if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) +- return NULL; +- s = name; +struct event_trigger mf_trigger = {"memory_failure_event", "MEM_FAIL_TRIGGER"}; + +struct event_trigger aer_ce_trigger = {"aer_event", "AER_CE_TRIGGER"}; @@ -586,10 +614,8 @@ index 3031f4b..334d945 100644 +struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"}; + +static struct event_trigger *event_triggers[] = { -+ &mc_ce_trigger, + &mc_ue_trigger, +#ifdef HAVE_MCE -+ &mce_ce_trigger, + &mce_de_trigger, + &mce_ue_trigger, +#endif @@ -604,13 +630,10 @@ index 3031f4b..334d945 100644 +}; + +void setup_event_trigger(const char *event) - { -- char *name; -- int rc; -- char *trigger_dir = 
getenv("TRIGGER_DIR"); ++{ + int i, j; + struct event_trigger *trigger; -+ char *s, timeout_env[30]; ++ char *s, timeout_env[64]; + + trigger_dir = getenv("TRIGGER_DIR"); + @@ -633,11 +656,7 @@ index 3031f4b..334d945 100644 + log(ALL, LOG_NOTICE, "Setup %s trigger `%s`\n", trigger->event_name, s); + + snprintf(timeout_env, sizeof(timeout_env), "%s_TIMEOUT", trigger->env); - -- if (trigger_dir) { -- if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) -- return NULL; -- s = name; ++ + trigger->timeout = 1; + s = getenv(timeout_env); + if (!s || !strcmp(s, "")) { @@ -745,8 +764,6 @@ index 3031f4b..334d945 100644 + __run_mce_trigger(e, &mce_ue_trigger); + else if (e->status & MCI_STATUS_DEFERRED) + __run_mce_trigger(e, &mce_de_trigger); -+ else -+ __run_mce_trigger(e, &mce_ce_trigger); +} + +static void __run_mc_trigger(struct ras_mc_event *ev, struct event_trigger *trigger) @@ -797,9 +814,6 @@ index 3031f4b..334d945 100644 + +void run_mc_event_trigger(struct ras_mc_event *e) +{ -+ if (!strcmp(e->error_type, "Corrected")) -+ __run_mc_trigger(e, &mc_ce_trigger); -+ + if (!strcmp(e->error_type, "Uncorrected")) + __run_mc_trigger(e, &mc_ue_trigger); +} @@ -812,7 +826,8 @@ index 3031f4b..334d945 100644 + + if (!trigger->path || !strcmp(trigger->path, "")) + return; -+ + +- return NULL; + if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) + goto free; + if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) @@ -833,8 +848,7 @@ index 3031f4b..334d945 100644 + for (i = 0; i < ei; i++) + free(env[i]); +} - -- return NULL; ++ +void run_mf_event_trigger(struct ras_mf_event *e) +{ + __run_mf_trigger(e, &mf_trigger); @@ -861,8 +875,8 @@ index 3031f4b..334d945 100644 + goto free; + if (ev->tlp_header_valid) + if (asprintf(&env[ei++], "TLP_HEADER=%08x %08x %08x %08x", -+ ev->tlp_header[0], ev->tlp_header[1], -+ ev->tlp_header[2], ev->tlp_header[3]) < 0) ++ ev->tlp_header[0], ev->tlp_header[1], ++ ev->tlp_header[2], ev->tlp_header[3]) < 0) 
+ goto free; + if (asprintf(&env[ei++], "MSG=%s", ev->msg) < 0) + goto free; @@ -887,10 +901,10 @@ index 3031f4b..334d945 100644 + __run_aer_trigger(e, &aer_fatal_trigger); } diff --git a/trigger.h b/trigger.h -index 0cc9df5..8d42176 100644 +index 7d25042..31eff96 100644 --- a/trigger.h +++ b/trigger.h -@@ -1,12 +1,23 @@ +@@ -3,12 +3,23 @@ #ifndef __TRIGGER_H__ #define __TRIGGER_H__ @@ -919,5 +933,5 @@ index 0cc9df5..8d42176 100644 #endif -- -2.33.1 +2.43.5 diff --git a/1012-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch b/1012-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch deleted file mode 100644 index a04f8e6fd90217810fb9af645b9f0794fa8259fc..0000000000000000000000000000000000000000 --- a/1012-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch +++ /dev/null @@ -1,128 +0,0 @@ -From fb4f603ad5ac035df16569ec9aa6b7117301ebf8 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Thu, 28 Apr 2022 18:58:43 +0100 -Subject: [PATCH 12/85] rasdaemon: ras-mc-ctl: Updated HiSilicon platform name - -Updated the HiSilicon platform name as KunPeng9xx. - -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 83ef9de..e765519 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1529,7 +1529,7 @@ sub errors - - # Definitions of the vendor platform IDs. 
- use constant { -- HISILICON_KUNPENG_9XX => "Kunpeng9xx", -+ HISILICON_KUNPENG_9XX => "KunPeng9xx", - }; - - sub vendor_errors_summary -@@ -1552,7 +1552,7 @@ sub vendor_errors_summary - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng9xx errors -+ # HiSilicon KunPeng9xx errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; -@@ -1569,7 +1569,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type1 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1587,7 +1587,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type2 error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1605,7 +1605,7 @@ sub vendor_errors_summary - $out .= "\t$sub_module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx PCIe controller error events summary:\n$out\n"; - } - $query_handle->finish; - -@@ -1623,7 +1623,7 @@ sub vendor_errors_summary - $out .= "\t$module_id: $count\n"; - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx common error events summary:\n$out\n"; -+ print "HiSilicon KunPeng9xx common error events summary:\n$out\n"; - } - $query_handle->finish; - } -@@ -1660,7 +1660,7 @@ sub vendor_errors - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -- # HiSilicon Kunpeng9xx errors -+ # HiSilicon KunPeng9xx errors - if ($platform_id eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, 
sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; -@@ -1684,7 +1684,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type1 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1709,7 +1709,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx OEM type2 error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1736,7 +1736,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx PCIe controller error events:\n$out\n"; - } - $query_handle->finish; - -@@ -1767,7 +1767,7 @@ sub vendor_errors - } - } - if ($out ne "") { -- print "HiSilicon Kunpeng9xx common error events:\n$out\n"; -+ print "HiSilicon KunPeng9xx common error events:\n$out\n"; - } - $query_handle->finish; - } -@@ -1784,7 +1784,7 @@ sub vendor_errors - sub vendor_platforms - { - print "\nSupported platforms for the vendor-specific errors:\n"; -- print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; -+ print "\tHiSilicon KunPeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; - print "\n"; - } - --- -2.33.1 - diff --git a/1013-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch b/1013-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch deleted file mode 100644 index b0ad59b7ceca23ef3fa172e6f9ecbcd5cc1894c1..0000000000000000000000000000000000000000 --- a/1013-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch +++ /dev/null @@ -1,56 +0,0 @@ -From ce25490736f8596d13711700999c16424b3b2487 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Wed, 20 Oct 2021 14:33:37 +0800 -Subject: [PATCH 13/85] rasdaemon: Fix the issue of sprintf data type mismatch - in uuid_le() - -The data type of sprintf called in the 
function uuid_le() is mismatch. -Arm64 compiler force it to unsigned char by default, and can work normally. -But if someone compile it with the option -fsigned-char, the function -can't work correctly. - -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - ras-extlog-handler.c | 2 +- - ras-non-standard-handler.c | 4 ++-- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c -index 5fd3580..1834687 100644 ---- a/ras-extlog-handler.c -+++ b/ras-extlog-handler.c -@@ -152,7 +152,7 @@ static char *uuid_le(const char *uu) - static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - - for (i = 0; i < 16; i++) { -- p += sprintf(p, "%.2x", uu[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); - switch (i) { - case 3: - case 5: -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 6ccf5bc..6d5a6f8 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -36,7 +36,7 @@ static char *uuid_le(const char *uu) - static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - - for (i = 0; i < 16; i++) { -- p += sprintf(p, "%.2x", uu[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); - switch (i) { - case 3: - case 5: -@@ -61,7 +61,7 @@ static int uuid_le_cmp(const char *sec_type, const char *uuid2) - 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; - - for (i = 0; i < 16; i++) -- p += sprintf(p, "%.2x", sec_type[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); - *p = 0; - return strncmp(uuid1, uuid2, 32); - } --- -2.33.1 - diff --git a/1013-rasdaemon-add-event-level-for-event-record.patch b/1013-rasdaemon-add-event-level-for-event-record.patch new file mode 100644 index 0000000000000000000000000000000000000000..f7a98e95140ad9d67d3b8b8ed5f3f7097e3b083a --- /dev/null +++ b/1013-rasdaemon-add-event-level-for-event-record.patch @@ -0,0 +1,489 @@ +From 
06f2f2a77aa546dcd5b0cb002869d08b8a016e5e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 28 Mar 2025 13:19:47 +0800 +Subject: [PATCH] rasdaemon: add event level for event record + +To help users distinguish more and more events, this patch introduces +event levels to indicate the severity of the current event to the +system. Currently, three main levels are used: Alert, Crit, Error. +Fatal events will be marked as "emerg" but in reality, the kernel +will panic upon receiving a fatal event, so rasdaemon does not +receive it. + +ALERT: The uncorrected hardware error has been fixed, but cause + side effects. +CRIT: The uncorrected hardware error has been detected. +ERROR: The corrected hardware error has been detected. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 2 +- + man/rasdaemon.1.in | 15 +++++++++ + ras-aer-handler.c | 22 +++++++++++-- + ras-arm-handler.c | 2 ++ + ras-cxl-handler.c | 7 ++++ + ras-devlink-handler.c | 2 ++ + ras-diskerror-handler.c | 1 + + ras-extlog-handler.c | 20 +++++++++++ + ras-mc-handler.c | 64 +++++++++++++++++++++++------------- + ras-mce-handler.c | 9 +++++ + ras-memory-failure-handler.c | 1 + + ras-nvgpu-nvml.c | 4 +-- + ras-page-isolation.c | 5 +-- + ras-poison-page-stat.c | 4 +-- + ras-signal-handler.c | 4 +-- + types.c | 18 ++++++++++ + types.h | 11 +++++++ + 17 files changed, 156 insertions(+), 35 deletions(-) + create mode 100644 types.c + +diff --git a/Makefile.am b/Makefile.am +index 72f30b4..564a20d 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -52,7 +52,7 @@ all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTAT + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +- bitfield.c trigger.c ++ bitfield.c trigger.c types.c + if WITH_SQLITE3 + rasdaemon_SOURCES += ras-record.c + endif +diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in +index e884e55..2288fd9 100644 +--- a/man/rasdaemon.1.in ++++ b/man/rasdaemon.1.in +@@ -72,6 +72,21 @@ 
environment variables. By default the config file is read from /etc/sysconfig/ra + + The general format is environmentname=value. + ++.SH LOG LEVEL ++ ++Each log entry has a level prefix that describes the severity of the log to ++help users determine which logs are more valuable. ++Currently, three levels are used:.TP ++ ++.B "ALERT" ++The uncorrected hardware error has been fixed, but cause side effects. ++.TP ++.B "CRIT" ++The uncorrected hardware error has been detected. ++.TP ++.B "ERROR" ++The corrected hardware error has been detected. ++ + .SH SEE ALSO + \fBras-mc-ctl\fR(8) + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 471ad9f..c67f267 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -123,6 +123,25 @@ int ras_aer_event_handler(struct trace_seq *s, + uint8_t sel_data[5]; + int seg, bus, dev, fn, rc; + #endif ++ const char *level; ++ ++ if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0) ++ return -1; ++ switch (severity_val) { ++ case HW_EVENT_AER_UNCORRECTED_NON_FATAL: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case HW_EVENT_AER_UNCORRECTED_FATAL: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case HW_EVENT_AER_CORRECTED: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ trace_seq_printf(s, "%s ", level); + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -156,9 +175,6 @@ int ras_aer_event_handler(struct trace_seq *s, + if (tep_get_field_val(s, event, "status", record, &status_val, 1) < 0) + return -1; + +- if (tep_get_field_val(s, event, "severity", record, &severity_val, 1) < 0) +- return -1; +- + /* Fills the error buffer. If it is a correctable error then use the + * aer_cor_errors bit field. Otherwise use aer_uncor_errors. 
+ */ +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index db29327..226feb3 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -489,6 +489,8 @@ int ras_arm_event_handler(struct trace_seq *s, + + memset(&ev, 0, sizeof(ev)); + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); ++ + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. + * On previous kernels, the way to properly generate an event would +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 6e5ddea..575fff8 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -133,6 +133,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + struct ras_cxl_poison_event ev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -345,6 +346,7 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + struct ras_cxl_aer_ue_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_CRIT]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -431,6 +433,7 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + struct ras_cxl_aer_ce_event ev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -516,6 +519,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, + struct ras_cxl_overflow_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; +@@ -733,6 +737,7 @@ int 
ras_cxl_generic_event_handler(struct trace_seq *s, + const uint8_t *buf; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + +@@ -848,6 +853,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, + struct ras_cxl_general_media_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + +@@ -1038,6 +1044,7 @@ int ras_cxl_dram_event_handler(struct trace_seq *s, + struct ras_cxl_dram_event ev; + + memset(&ev, 0, sizeof(ev)); ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) + return -1; + +diff --git a/ras-devlink-handler.c b/ras-devlink-handler.c +index da5645d..93eba91 100644 +--- a/ras-devlink-handler.c ++++ b/ras-devlink-handler.c +@@ -83,6 +83,8 @@ int ras_devlink_event_handler(struct trace_seq *s, + if (ras->filters[DEVLINK_EVENT] && + tep_filter_match(ras->filters[DEVLINK_EVENT], record) == FILTER_MATCH) + return 0; ++ ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. + * On previous kernels, the way to properly generate an event would +diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c +index 43c023b..6044efa 100644 +--- a/ras-diskerror-handler.c ++++ b/ras-diskerror-handler.c +@@ -57,6 +57,7 @@ int ras_diskerror_event_handler(struct trace_seq *s, + struct diskerror_event ev; + uint32_t dev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ERR]); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+ * On previous kernels, the way to properly generate an event would +diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c +index 46c06cf..56acf1a 100644 +--- a/ras-extlog-handler.c ++++ b/ras-extlog-handler.c +@@ -208,6 +208,26 @@ static void report_extlog_mem_event(struct ras_events *ras, + struct trace_seq *s, + struct ras_extlog_event *ev) + { ++ const char *level; ++ ++ switch (ev->severity) { ++ case 0: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case 1: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case 2: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ case 3: ++ level = loglevel_str[LOGLEVEL_INFO]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ trace_seq_printf(s, "%s ", level); + trace_seq_printf(s, "%d %s error: %s physical addr: 0x%llx mask: 0x%llx%s %s %s", + ev->error_seq, err_severity(ev->severity), + err_type(ev->etype), ev->address, +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index a729d93..e55c199 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -36,7 +36,7 @@ static int ras_mc_event_stat(time_t now, struct ras_mc_event *e) + } + + if (per_sec_ce_count > mc_ce_stat_threshold) +- log(ALL, LOG_ERR, " mc_event_stat: memory corrected error report %lld/sec\n", per_sec_ce_count); ++ log(ALL, LOG_ERR, " mc_event_stat: %s memory corrected error report %lld/sec\n", loglevel_str[LOGLEVEL_ALERT], per_sec_ce_count); + + return 0; + } +@@ -52,6 +52,46 @@ int ras_mc_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_mc_event ev; + int parsed_fields = 0; ++ const char *level; ++ ++ if (tep_get_field_val(s, event, "error_type", record, &val, 1) < 0) ++ goto parse_error; ++ parsed_fields++; ++ ++ switch (val) { ++ case HW_EVENT_ERR_CORRECTED: ++ ev.error_type = "Corrected"; ++ break; ++ case HW_EVENT_ERR_UNCORRECTED: ++ ev.error_type = "Uncorrected"; ++ break; ++ case HW_EVENT_ERR_DEFERRED: ++ ev.error_type = "Deferred"; ++ break; ++ case HW_EVENT_ERR_FATAL: ++ ev.error_type 
= "Fatal"; ++ break; ++ case HW_EVENT_ERR_INFO: ++ default: ++ ev.error_type = "Info"; ++ } ++ ++ switch (val) { ++ case HW_EVENT_ERR_UNCORRECTED: ++ case HW_EVENT_ERR_DEFERRED: ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ break; ++ case HW_EVENT_ERR_FATAL: ++ level = loglevel_str[LOGLEVEL_EMERG]; ++ break; ++ case HW_EVENT_ERR_CORRECTED: ++ level = loglevel_str[LOGLEVEL_ERR]; ++ break; ++ default: ++ level = loglevel_str[LOGLEVEL_DEBUG]; ++ break; ++ } ++ trace_seq_printf(s, "%s ", level); + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -80,28 +120,6 @@ int ras_mc_event_handler(struct trace_seq *s, + ev.error_count = val; + trace_seq_printf(s, "%d ", ev.error_count); + +- if (tep_get_field_val(s, event, "error_type", record, &val, 1) < 0) +- goto parse_error; +- parsed_fields++; +- +- switch (val) { +- case HW_EVENT_ERR_CORRECTED: +- ev.error_type = "Corrected"; +- break; +- case HW_EVENT_ERR_UNCORRECTED: +- ev.error_type = "Uncorrected"; +- break; +- case HW_EVENT_ERR_DEFERRED: +- ev.error_type = "Deferred"; +- break; +- case HW_EVENT_ERR_FATAL: +- ev.error_type = "Fatal"; +- break; +- case HW_EVENT_ERR_INFO: +- default: +- ev.error_type = "Info"; +- } +- + trace_seq_puts(s, ev.error_type); + if (ev.error_count > 1) + trace_seq_puts(s, " errors:"); +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 92c5339..c272bb0 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -290,7 +290,16 @@ void report_mce_event(struct ras_events *ras, struct tep_record *record, + time_t now; + struct tm *tm; + struct mce_priv *mce = ras->mce_priv; ++ const char *level; + ++ if (e->status & MCI_STATUS_UC) ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ else if (e->status & MCI_STATUS_DEFERRED) ++ level = loglevel_str[LOGLEVEL_CRIT]; ++ else ++ level = loglevel_str[LOGLEVEL_ERR]; ++ ++ trace_seq_printf(s, "%s ", level); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+ * On previous kernels, the way to properly generate an event would +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 0f4e937..43e7c5d 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -119,6 +119,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_mf_event ev; + ++ trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ALERT]); + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. + * On previous kernels, the way to properly generate an event would +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index aabe8f9..2758d14 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -58,12 +58,12 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + if (data->eventType == nvmlEventTypeXidCriticalError) { + trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", + "<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME); +- trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); + trace_seq_printf(&s, "xid: %lld ", data->eventData); + } else { + trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", + "<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME); +- trace_seq_printf(&s, "%s ", timestamp); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); + trace_seq_printf(&s, "event_type: %s(%llx) ", my_nvmlEventTypeString(data->eventType), data->eventType); + trace_seq_printf(&s, "data: %lld ", data->eventData); + } +diff --git a/ras-page-isolation.c b/ras-page-isolation.c +index 246cd12..237495c 100644 +--- a/ras-page-isolation.c ++++ b/ras-page-isolation.c +@@ -17,6 +17,7 @@ + #include "ras-page-isolation.h" + #include "ras-poison-page-stat.h" + #include "ras-record.h" ++#include "types.h" + + #define PARSED_ENV_LEN 50 + #define ROW_ID_MAX_LEN 200 +@@ -349,8 +350,8 @@ static void page_offline(struct page_record *pr) + + pr->offlined = ret < 0 ? 
PAGE_OFFLINE_FAILED : PAGE_OFFLINE; + +- log(TERM, LOG_INFO, "Result of offlining page at %#llx: %s\n", +- addr, page_state[pr->offlined]); ++ log(TERM, LOG_INFO, "%s Result of offlining page at %#llx: %s\n", ++ loglevel_str[LOGLEVEL_ALERT], addr, page_state[pr->offlined]); + + #ifdef HAVE_POISON_PAGE_STAT + ras_poison_page_stat(); +diff --git a/ras-poison-page-stat.c b/ras-poison-page-stat.c +index 2ce1d2a..c8d8859 100644 +--- a/ras-poison-page-stat.c ++++ b/ras-poison-page-stat.c +@@ -34,8 +34,8 @@ int ras_poison_page_stat(void) + fclose(fp); + + if (corrupted_kb > poison_stat_threshold) +- log(ALL, LOG_WARNING, "Poison page statistics exceeded threshold: %lld kB (threshold: %lld kB)\n", +- corrupted_kb, poison_stat_threshold); ++ log(ALL, LOG_WARNING, "%s Poison page statistics exceeded threshold: %lld kB (threshold: %lld kB)\n", ++ loglevel_str[LOGLEVEL_ALERT], corrupted_kb, poison_stat_threshold); + + return 0; + } +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +index fb0bfd3..c497bf0 100644 +--- a/ras-signal-handler.c ++++ b/ras-signal-handler.c +@@ -61,8 +61,8 @@ static char *signal_res[] = { + static void report_ras_signal_event(struct trace_seq *s, struct ras_signal_event *ev) + { + trace_seq_printf(s, +- "%s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s", +- ev->timestamp, strsignal(ev->sig), ev->error_no, ++ "%s %s signal: %s, errorno: %d, code: %s, comm: %s, pid: %d, grp: %d, res: %s, msg: %s", ++ loglevel_str[LOGLEVEL_ALERT], ev->timestamp, strsignal(ev->sig), ev->error_no, + (ev->code < 0 || ev->code > BUS_MCEERR_AO) ? 
"Unknown" : errcode_str[ev->code], + ev->comm, ev->pid, + ev->group, +diff --git a/types.c b/types.c +new file mode 100644 +index 0000000..d4270ac +--- /dev/null ++++ b/types.c +@@ -0,0 +1,18 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++ * Copyright (C) 2025 Alibaba Inc ++ */ ++ ++#include "types.h" ++ ++const char *loglevel_str[] = { ++ [LOGLEVEL_EMERG] = "[EMERG]", ++ [LOGLEVEL_ALERT] = "[ALERT]", ++ [LOGLEVEL_CRIT] = "[CRIT]", ++ [LOGLEVEL_ERR] = "[ERROR]", ++ [LOGLEVEL_WARNING] = "[WARNING]", ++ [LOGLEVEL_NOTICE] = "[NOTICE]", ++ [LOGLEVEL_INFO] = "[INFO]", ++ [LOGLEVEL_DEBUG] = "[DEBUG]", ++}; +\ No newline at end of file +diff --git a/types.h b/types.h +index 58cac1f..8563919 100644 +--- a/types.h ++++ b/types.h +@@ -189,4 +189,15 @@ static inline size_t strscat(char *dst, const char *src, size_t dsize) + "pointer type mismatch in container_of()"); \ + ((type *)(__mptr - offsetof(type, member))); }) + ++#define LOGLEVEL_DEFAULT -1 /* default (or last) loglevel */ ++#define LOGLEVEL_EMERG 0 /* system is unusable */ ++#define LOGLEVEL_ALERT 1 /* action must be taken immediately */ ++#define LOGLEVEL_CRIT 2 /* critical conditions */ ++#define LOGLEVEL_ERR 3 /* error conditions */ ++#define LOGLEVEL_WARNING 4 /* warning conditions */ ++#define LOGLEVEL_NOTICE 5 /* normal but significant condition */ ++#define LOGLEVEL_INFO 6 /* informational */ ++#define LOGLEVEL_DEBUG 7 /* debug-level messages */ ++ ++extern const char *loglevel_str[]; + #endif +-- +2.43.5 + diff --git a/1014-anolis-syslog-add-rasdaemon.ext.patch b/1014-anolis-syslog-add-rasdaemon.ext.patch new file mode 100644 index 0000000000000000000000000000000000000000..9bbc6f3727b9361d9131ae4481c74ea01799bfb3 --- /dev/null +++ b/1014-anolis-syslog-add-rasdaemon.ext.patch @@ -0,0 +1,250 @@ +From b4e1a8c87a7c079c35db5190067808df4ae471a6 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 3 Apr 2025 15:16:09 +0800 +Subject: [PATCH 14/30] anolis: syslog: add rasdaemon.ext + +Filter 
aer/pcihp/cmcistorm event through syslog-ng/rsyslog + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 24 ++++++++++- + misc/rasdaemon.rsyslog-ext.in | 26 ++++++++++++ + misc/rasdaemon.spec.in | 10 +++++ + misc/rasdaemon.syslog-ng-ext.in | 71 +++++++++++++++++++++++++++++++++ + 5 files changed, 131 insertions(+), 2 deletions(-) + create mode 100644 misc/rasdaemon.rsyslog-ext.in + create mode 100644 misc/rasdaemon.syslog-ng-ext.in + +diff --git a/Makefile.am b/Makefile.am +index 564a20d..ab26412 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -11,17 +11,25 @@ LOGROTATE_SERVICES_IN = misc/rasdaemon.logrotate.in + LOGROTATE_SERVICES = $(LOGROTATE_SERVICES_IN:.logrotate.in=.logrotate) + RSYSLOG_SERVICES_IN = misc/rasdaemon.rsyslog.in + RSYSLOG_SERVICES = $(RSYSLOG_SERVICES_IN:.rsyslog.in=.rsyslog) ++SYSLOG_EXT_SERVICES_IN = misc/rasdaemon.syslog-ng-ext.in ++SYSLOG_EXT_SERVICES = $(SYSLOG_EXT_SERVICES_IN:.syslog-ng-ext.in=.syslog-ng-ext) ++RSYSLOG_EXT_SERVICES_IN = misc/rasdaemon.rsyslog-ext.in ++RSYSLOG_EXT_SERVICES = $(RSYSLOG_EXT_SERVICES_IN:.rsyslog-ext.in=.rsyslog-ext) + EXTRA_DIST = \ + $(SYSTEMD_SERVICES_IN) \ + $(SYSLOG_SERVICES_IN) \ + $(RSYSLOG_SERVICES_IN) \ + $(LOGROTATE_SERVICES_IN) \ ++ $(SYSLOG_EXT_SERVICES_IN) \ ++ $(RSYSLOG_EXT_SERVICES_IN) \ + misc/rasdaemon.env \ + contrib/nvml.py \ + contrib/*_trigger + + CLEANFILES= \ + ras-nvgpu-nvml.h \ ++ misc/rasdaemon.syslog-ng-ext \ ++ misc/rasdaemon.rsyslog-ext \ + misc/ras-mc-ctl.service \ + misc/rasdaemon.service \ + misc/rasdaemon.syslog-ng \ +@@ -33,7 +41,7 @@ DISTCLEANFILES = misc/rasdaemon.spec + # This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin + # during ./configure phase, therefore it is not possible to add .service.in + # files to AC_CONFIG_FILES in configure.ac +-SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog ++SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-ng .rsyslog.in .rsyslog 
.rsyslog-ext.in .rsyslog-ext .syslog-ng-ext.in .syslog-ng-ext + .service.in.service: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@SYSCONFDEFDIR\@,@SYSCONFDEFDIR@,g $< > $@ + +@@ -46,9 +54,15 @@ SUFFIXES = .service.in .service .logrotate.in .logrotate .syslog-ng.in .syslog-n + .rsyslog.in.rsyslog: + sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ + ++.syslog-ng-ext.in.syslog-ng-ext: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ ++.rsyslog-ext.in.rsyslog-ext: ++ sed -e s,\@sbindir\@,$(sbindir),g -e s,\@localstatedir\@,${localstatedir},g $< > $@ ++ + # This rule is needed because the service files must be generated on target + # system after ./configure phase +-all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES) ++all-local: $(SYSTEMD_SERVICES) $(SYSLOG_SERVICES) $(RSYSLOG_SERVICES) $(LOGROTATE_SERVICES) $(SYSLOG_EXT_SERVICES) $(RSYSLOG_EXT_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +@@ -179,4 +193,10 @@ install-data-local: + if [ -d "$(DESTDIR)@sysconfdir@/logrotate.d" ]; then \ + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.logrotate "$(DESTDIR)@sysconfdir@/logrotate.d/rasdaemon"; \ + fi ++ if [ -d "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.syslog-ng-ext "$(DESTDIR)@sysconfdir@/syslog-ng/conf.d/rasdaemon.syslog-ng-ext"; \ ++ fi ++ if [ -d "$(DESTDIR)@sysconfdir@/rsyslog.d/" ]; then \ ++ install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog-ext "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.rsyslog-ext"; \ ++ fi + $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" +diff --git a/misc/rasdaemon.rsyslog-ext.in b/misc/rasdaemon.rsyslog-ext.in +new file mode 100644 +index 0000000..63cffc2 +--- /dev/null ++++ b/misc/rasdaemon.rsyslog-ext.in +@@ -0,0 +1,26 @@ ++# SPDX-License-Identifier: GPL-2.0 
++ ++template(name="rasdaemon_temp" type="string" string="%timegenerated% %hostname% rasdaemon: %$!event%: %$!level% %msg%\n") ++ ++if ($syslogfacility-text == "kern" and $msg contains "CMCI storm") then { ++ set $!event = "cmci_storm"; ++ ++ if $msg contains "detected" then set $!level = "[ALERT]"; ++ if $msg contains "subsided" then set $!level = "[ERROR]"; ++ action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp") ++} ++ ++if ($syslogfacility-text == "kern" and $msg contains "AER: device recovery") then { ++ set $!event = "aer_recovery"; ++ ++ if $msg contains "failed" then set $!level = "[EMERG]"; ++ if $msg contains "successful" then set $!level = "[ALERT]"; ++ action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp") ++} ++ ++if ($syslogfacility-text == "kern" and $msg contains "pciehp: Slot") then { ++ set $!event = "pciehp"; ++ if $msg contains "Link Down" then set $!level = "[ALERT]"; ++ if $msg contains "Card not present" then set $!level = "[ALERT]"; ++ action(type="omfile" file="/var/log/rasdaemon" template="rasdaemon_temp") ++} +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index a30045c..521f148 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -57,6 +57,8 @@ install -D -p -m 0655 misc/%{name}.env %{buildroot}%{_sysconfdir}/sysconfig/%{na + install -D -p -m 0655 misc/%{name}.syslog-ng %{buildroot}/usr/share/%{name}/%{name}.syslog-ng + install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{name}.logrotate + install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog ++install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext ++install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -71,18 +73,24 @@ rm INSTALL %{buildroot}/usr/include/*.h + %config(noreplace) 
/usr/share/%{name}/%{name}.syslog-ng + %config(noreplace) /usr/share/%{name}/%{name}.logrotate + %config(noreplace) /usr/share/%{name}/%{name}.rsyslog ++%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext ++%config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext + + %post + if systemctl is-active --quiet syslog-ng.service; then + echo "Syslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.syslog-ng %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; ++ ln -s /usr/share/%{name}/%{name}.syslog-ng-ext %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; + systemctl restart syslog-ng.service; + fi + if systemctl is-active --quiet rsyslog.service; then + echo "Rsyslog service is enabled and running, create config file and restart it"; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; + ln -s /usr/share/%{name}/%{name}.rsyslog %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; ++ ln -s /usr/share/%{name}/%{name}.rsyslog-ext %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; + systemctl restart rsyslog.service; + fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then +@@ -103,11 +111,13 @@ systemctl disable %{name}.service + if systemctl is-active --quiet syslog-ng.service; then + echo "Syslog-ng service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/syslog-ng/conf.d/%{name}-ext.conf; + systemctl restart syslog-ng.service; + fi + if systemctl is-active --quiet rsyslog.service; then + echo "Rsyslog service is enabled and running, delete config file and restart it"; + rm -rf %{_sysconfdir}/rsyslog.d/%{name}.conf; ++ rm -rf %{_sysconfdir}/rsyslog.d/%{name}-ext.conf; + systemctl restart rsyslog.service; + fi + if [ -d "%{_sysconfdir}/logrotate.d" ]; then +diff --git 
a/misc/rasdaemon.syslog-ng-ext.in b/misc/rasdaemon.syslog-ng-ext.in +new file mode 100644 +index 0000000..ad001d2 +--- /dev/null ++++ b/misc/rasdaemon.syslog-ng-ext.in +@@ -0,0 +1,71 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++destination d_ras { ++ file("/var/log/rasdaemon" ++ template("${DATE} ${HOST} rasdaemon: ${RASDAEMON_EVENT}: ${RASDAEMON_LEVEL} ${MESSAGE}\n") ++ persist-name(ras-ext)); ++}; ++ ++filter f_aer { ++ facility(kern) and ++ match("AER: device recovery" value("MESSAGE")); ++}; ++ ++rewrite r_aer { ++ set("aer_recovery", value("RASDAEMON_EVENT")); ++ set("[EMERG]", value("RASDAEMON_LEVEL") ++ condition(match("failed" value("MESSAGE"))) ++ ); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("successful" value("MESSAGE"))) ++ ); ++}; ++ ++filter f_cmcistorm { ++ facility(kern) and ++ match("CMCI storm" value("MESSAGE")); ++}; ++ ++rewrite r_cmcistorm { ++ set("cmci_storm", value("RASDAEMON_EVENT")); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("detected" value("MESSAGE"))) ++ ); ++ set("[ERROR]", value("RASDAEMON_LEVEL") ++ condition(match("subsided" value("MESSAGE"))) ++ ); ++}; ++ ++filter f_pciehp { ++ facility(kern) and ++ match("pciehp: Slot" value("MESSAGE")); ++}; ++ ++rewrite r_pciehp { ++ set("pciehp", value("RASDAEMON_EVENT")); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("Link Down" value("MESSAGE"))) ++ ); ++ set("[ALERT]", value("RASDAEMON_LEVEL") ++ condition(match("Card not present" value("MESSAGE"))) ++ ); ++}; ++ ++log { ++ source(s_sys); ++ junction { ++ channel { ++ filter(f_cmcistorm); ++ rewrite(r_cmcistorm); ++ }; ++ channel { ++ filter(f_pciehp); ++ rewrite(r_pciehp); ++ }; ++ channel { ++ filter(f_aer); ++ rewrite(r_aer); ++ }; ++ }; ++ destination(d_ras); ++}; +\ No newline at end of file +-- +2.43.5 + diff --git a/1014-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch b/1014-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch deleted file mode 100644 index 
2de55aa308e9bfac4eebaf206f99c3cb19c110d9..0000000000000000000000000000000000000000 --- a/1014-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 049fc251c32b9a9eaf15a183df451a6a5c937f43 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Wed, 20 Oct 2021 14:33:38 +0800 -Subject: [PATCH 14/85] rasdaemon: Fix the issue of command option -r for hip08 - -It will record event even the option -r is not provided for hip08. -It is not right, and fix it. - -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisi_hip08.c | 6 +++--- - non-standard-hisilicon.c | 6 ++++++ - 2 files changed, 9 insertions(+), 3 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index ebf03e1..9092183 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -670,7 +670,7 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras, - } - - #ifdef HAVE_SQLITE3 -- if (!ev_decoder->stmt_dec_record) { -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { - if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &hip08_oem_type1_event_tab) - != SQLITE_OK) { -@@ -842,7 +842,7 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras, - } - - #ifdef HAVE_SQLITE3 -- if (!ev_decoder->stmt_dec_record) { -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { - if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &hip08_oem_type2_event_tab) != SQLITE_OK) { - trace_seq_printf(s, -@@ -992,7 +992,7 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, - } - - #ifdef HAVE_SQLITE3 -- if (!ev_decoder->stmt_dec_record) { -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { - if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &hip08_pcie_local_event_tab) != SQLITE_OK) { - trace_seq_printf(s, -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index cd0ab3f..8da891f 100644 ---- 
a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -91,6 +91,9 @@ void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder, - enum hisi_oem_data_type data_type, - int id, int64_t data, const char *text) - { -+ if (ev_decoder->stmt_dec_record == NULL) -+ return; -+ - switch (data_type) { - case HISI_OEM_DATA_TYPE_INT: - sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data); -@@ -108,6 +111,9 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) - { - int rc; - -+ if (ev_decoder->stmt_dec_record == NULL) -+ return 0; -+ - rc = sqlite3_step(ev_decoder->stmt_dec_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) - log(TERM, LOG_ERR, --- -2.33.1 - diff --git a/1015-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch b/1015-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch deleted file mode 100644 index 872eb5d977908479be60e996ace956b5248daa0d..0000000000000000000000000000000000000000 --- a/1015-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 243b0d9bc40dc8cb10490eb14604cf750bc65e56 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Wed, 20 Oct 2021 14:33:40 +0800 -Subject: [PATCH 15/85] rasdaemon: Add some modules supported by hisi common - error section - -Add some modules supported by hisi common error section. Besides, -HHA is the module for some old platform, and it takes the same place -of MATA, so remove it. 
- -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisilicon.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 8da891f..d1e1774 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -210,7 +210,11 @@ static const char* module_name[] = { - "SEC", - "RDE", - "MEE", -- "HHA", -+ "L4D", -+ "Tsensor", -+ "ROH", -+ "BTC", -+ "HILINK" - }; - - static const char* get_soc_desc(uint8_t soc_id) --- -2.33.1 - diff --git a/1076-rasdaemon-add-page-offline-trigger.patch b/1015-rasdaemon-add-page-offline-trigger.patch similarity index 55% rename from 1076-rasdaemon-add-page-offline-trigger.patch rename to 1015-rasdaemon-add-page-offline-trigger.patch index a6ca360c81532a08ef446a9eb7dad7fe97fce845..20480a043a15f5ababeca2ba1492d6c2d5fc88f1 100644 --- a/1076-rasdaemon-add-page-offline-trigger.patch +++ b/1015-rasdaemon-add-page-offline-trigger.patch @@ -1,27 +1,32 @@ -From af1941853b6da51bf985c0b4819ffd7556572600 Mon Sep 17 00:00:00 2001 +From e9995846c39321300a9c89936086222fab3cbb1c Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Fri, 13 Dec 2024 14:38:02 +0800 -Subject: [PATCH 76/85] rasdaemon: add page offline trigger +Subject: [PATCH 15/30] rasdaemon: add page offline trigger page offline include pre trigger and post trigger. 
+ +Signed-off-by: Ruidong Tian --- - contrib/page_offline_trigger | 15 ++++++++++++ - misc/rasdaemon.env | 5 ++++ - ras-page-isolation.c | 4 ++++ - trigger.c | 43 ++++++++++++++++++++++++++++++++++- - trigger.h | 6 +++++ - 5 files changed, 72 insertions(+), 1 deletion(-) - create mode 100755 contrib/page_offline_trigger + contrib/page_offline_post_trigger | 25 ++++++++++++++++++ + contrib/page_offline_pre_trigger | 25 ++++++++++++++++++ + misc/rasdaemon.env | 5 ++++ + ras-page-isolation.c | 4 +++ + trigger.c | 43 +++++++++++++++++++++++++++++++ + trigger.h | 6 +++++ + 6 files changed, 108 insertions(+) + create mode 100755 contrib/page_offline_post_trigger + create mode 100755 contrib/page_offline_pre_trigger -diff --git a/contrib/page_offline_trigger b/contrib/page_offline_trigger +diff --git a/contrib/page_offline_post_trigger b/contrib/page_offline_post_trigger new file mode 100755 -index 0000000..867390b +index 0000000..4d3329c --- /dev/null -+++ b/contrib/page_offline_trigger -@@ -0,0 +1,15 @@ ++++ b/contrib/page_offline_post_trigger +@@ -0,0 +1,25 @@ +#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 +# This shell script can be executed by rasdaemon in daemon mode when a -+# memory_failure_event is occured, environment variables include all ++# memory_failure_event is occurred, environment variables include all +# information reported by tracepoint. + +# environment: @@ -30,36 +35,79 @@ index 0000000..867390b +# OTYPE POST | PRE +# + -+[ -x ./page_offline_trigger.local ] && . ./page_offline_trigger.local ++[ -x ./page_offline_post_trigger.local ] && . ./page_offline_post_trigger.local ++ ++if [ -d page_offline_post_trigger.extern ] ++then ++ ls page_offline_post_trigger.extern | ++ while read item ++ do ++ [ -x ./page_offline_post_trigger.extern/$item ] && . 
./page_offline_post_trigger.extern/$item $1 ++ done ++fi ++ ++ ++exit 0 +diff --git a/contrib/page_offline_pre_trigger b/contrib/page_offline_pre_trigger +new file mode 100755 +index 0000000..e464382 +--- /dev/null ++++ b/contrib/page_offline_pre_trigger +@@ -0,0 +1,25 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when a ++# memory_failure_event is occurred, environment variables include all ++# information reported by tracepoint. ++ ++# environment: ++# TIMESTAMP Timestamp when error occurred ++# ADDR Address ++# OTYPE POST | PRE ++# ++ ++[ -x ./page_offline_pre_trigger.local ] && . ./page_offline_pre_trigger.local ++ ++if [ -d page_offline_pre_trigger.extern ] ++then ++ ls page_offline_pre_trigger.extern | ++ while read item ++ do ++ [ -x ./page_offline_pre_trigger.extern/$item ] && . ./page_offline_pre_trigger.extern/$item $1 ++ done ++fi + + +exit 0 diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 9293038..67f488f 100644 +index 1f5da55..f3f17c2 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -86,3 +86,8 @@ AER_CE_TRIGGER_TIMEOUT=0 +@@ -108,6 +108,11 @@ AER_CE_TRIGGER_TIMEOUT=0 AER_UE_TRIGGER_TIMEOUT=0 AER_FATAL_TRIGGER_TIMEOUT=0 -+# trigger for page offline +PRE_PAGE_OFFLINE_TRIGGER= +POST_PAGE_OFFLINE_TRIGGER= +PRE_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 +POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second. 
diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index d19ce26..16fb082 100644 +index 237495c..569293f 100644 --- a/ras-page-isolation.c +++ b/ras-page-isolation.c -@@ -23,6 +23,7 @@ - - #include "ras-logger.h" - #include "ras-page-isolation.h" +@@ -18,6 +18,7 @@ + #include "ras-poison-page-stat.h" + #include "ras-record.h" + #include "types.h" +#include "trigger.h" #define PARSED_ENV_LEN 50 - static const struct config threshold_units[] = { -@@ -211,6 +212,7 @@ void ras_page_account_init(void) + #define ROW_ID_MAX_LEN 200 +@@ -296,6 +297,7 @@ void ras_page_account_init(void) { page_offline_init(); page_isolation_init(); @@ -67,7 +115,7 @@ index d19ce26..16fb082 100644 } static int do_page_offline(unsigned long long addr, enum otype type) -@@ -218,6 +220,7 @@ static int do_page_offline(unsigned long long addr, enum otype type) +@@ -303,6 +305,7 @@ static int do_page_offline(unsigned long long addr, enum otype type) int fd, rc; char buf[20]; @@ -75,7 +123,7 @@ index d19ce26..16fb082 100644 fd = open(kernel_offline[type], O_WRONLY); if (fd == -1) { log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, -@@ -233,6 +236,7 @@ static int do_page_offline(unsigned long long addr, enum otype type) +@@ -318,6 +321,7 @@ static int do_page_offline(unsigned long long addr, enum otype type) buf, kernel_offline[type], errno); close(fd); @@ -84,19 +132,18 @@ index d19ce26..16fb082 100644 } diff --git a/trigger.c b/trigger.c -index 334d945..00f0cbc 100644 +index a13fffd..7387113 100644 --- a/trigger.c +++ b/trigger.c -@@ -8,7 +8,7 @@ +@@ -11,6 +11,7 @@ #include "types.h" #include "trigger.h" --#include "ras-event.c" +#include "ras-events.h" #include "ras-mce-handler.h" #define MAX_ENV 30 -@@ -96,6 +96,9 @@ struct event_trigger aer_ce_trigger = {"aer_event", "AER_CE_TRIGGER"}; +@@ -95,6 +96,9 @@ struct event_trigger aer_ce_trigger = {"aer_event", "AER_CE_TRIGGER"}; struct event_trigger aer_ue_trigger = {"aer_event", "AER_UE_TRIGGER"}; struct event_trigger 
aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"}; @@ -104,9 +151,9 @@ index 334d945..00f0cbc 100644 +struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFFLINE_TRIGGER"}; + static struct event_trigger *event_triggers[] = { - &mc_ce_trigger, &mc_ue_trigger, -@@ -112,6 +115,10 @@ static struct event_trigger *event_triggers[] = { + #ifdef HAVE_MCE +@@ -109,6 +113,10 @@ static struct event_trigger *event_triggers[] = { &aer_ue_trigger, &aer_fatal_trigger, #endif @@ -117,10 +164,10 @@ index 334d945..00f0cbc 100644 }; void setup_event_trigger(const char *event) -@@ -365,6 +372,31 @@ static void __run_aer_trigger(struct ras_aer_event *ev, struct event_trigger *tr - ev->tlp_header[2], ev->tlp_header[3]) < 0) +@@ -358,6 +366,32 @@ static void __run_aer_trigger(struct ras_aer_event *ev, struct event_trigger *tr goto free; if (asprintf(&env[ei++], "MSG=%s", ev->msg) < 0) + goto free; + env[ei] = NULL; + assert(ei < MAX_ENV); + @@ -146,10 +193,11 @@ index 334d945..00f0cbc 100644 + if (asprintf(&env[ei++], "ADDR=%#llx", addr) < 0) + goto free; + if (asprintf(&env[ei++], "OTYPE=%d", otype) < 0) - goto free; ++ goto free; env[ei] = NULL; -@@ -386,3 +418,12 @@ void run_aer_event_trigger(struct ras_aer_event *e) + assert(ei < MAX_ENV); +@@ -378,3 +412,12 @@ void run_aer_event_trigger(struct ras_aer_event *e) else if (!strcmp(e->error_type, "Uncorrected (Fatal)")) __run_aer_trigger(e, &aer_fatal_trigger); } @@ -163,10 +211,10 @@ index 334d945..00f0cbc 100644 +} + diff --git a/trigger.h b/trigger.h -index 8d42176..d1b50c3 100644 +index 31eff96..74df3d3 100644 --- a/trigger.h +++ b/trigger.h -@@ -3,6 +3,11 @@ +@@ -5,6 +5,11 @@ #include "ras-record.h" @@ -178,7 +226,7 @@ index 8d42176..d1b50c3 100644 struct event_trigger { const char *event_name; const char *env; -@@ -19,5 +24,6 @@ void run_mc_event_trigger(struct ras_mc_event *e); +@@ -21,5 +26,6 @@ void run_mc_event_trigger(struct ras_mc_event *e); void run_mce_record_trigger(struct mce_event *e); void 
run_mf_event_trigger(struct ras_mf_event *e); void run_aer_event_trigger(struct ras_aer_event *e); @@ -186,5 +234,5 @@ index 8d42176..d1b50c3 100644 #endif -- -2.33.1 +2.43.5 diff --git a/1016-anolis-compta-rasdaemon-notices.patch b/1016-anolis-compta-rasdaemon-notices.patch new file mode 100644 index 0000000000000000000000000000000000000000..e13915c789c78aa257f0159edd013fc0a0ad9070 --- /dev/null +++ b/1016-anolis-compta-rasdaemon-notices.patch @@ -0,0 +1,129 @@ +From c1182ad260e0161817d0a4bbea31bcfe5fe7dbd3 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 13 Dec 2024 14:38:02 +0800 +Subject: [PATCH 16/30] anolis: compta rasdaemon notices + +page offline include pre trigger and post trigger. + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 1 + + contrib/page_offline_post_trigger | 2 ++ + contrib/page_offline_pre_trigger | 2 ++ + misc/notices/page-ce-offline-post-notice | 16 ++++++++++++++++ + misc/notices/page-ce-offline-pre-notice | 18 ++++++++++++++++++ + misc/rasdaemon.spec.in | 3 +++ + 6 files changed, 42 insertions(+) + create mode 100644 misc/notices/page-ce-offline-post-notice + create mode 100644 misc/notices/page-ce-offline-pre-notice + +diff --git a/Makefile.am b/Makefile.am +index ab26412..61f9a84 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -23,6 +23,7 @@ EXTRA_DIST = \ + $(SYSLOG_EXT_SERVICES_IN) \ + $(RSYSLOG_EXT_SERVICES_IN) \ + misc/rasdaemon.env \ ++ misc/notices \ + contrib/nvml.py \ + contrib/*_trigger + +diff --git a/contrib/page_offline_post_trigger b/contrib/page_offline_post_trigger +index 4d3329c..ad7d44c 100755 +--- a/contrib/page_offline_post_trigger ++++ b/contrib/page_offline_post_trigger +@@ -12,6 +12,8 @@ + + [ -x ./page_offline_post_trigger.local ] && . ./page_offline_post_trigger.local + ++[ -x /etc/rasdaemon_notices/page-ce-offline-post-notice ] && . 
/etc/rasdaemon_notices/page-ce-offline-post-notice $(printf "%lu" "$ADDR") ++ + if [ -d page_offline_post_trigger.extern ] + then + ls page_offline_post_trigger.extern | +diff --git a/contrib/page_offline_pre_trigger b/contrib/page_offline_pre_trigger +index e464382..6d8d3f2 100755 +--- a/contrib/page_offline_pre_trigger ++++ b/contrib/page_offline_pre_trigger +@@ -12,6 +12,8 @@ + + [ -x ./page_offline_pre_trigger.local ] && . ./page_offline_pre_trigger.local + ++[ -x /etc/rasdaemon_notices/page-ce-offline-pre-notice ] && . /etc/rasdaemon_notices/page-ce-offline-pre-notice $(printf "%lu" "$ADDR") ++ + if [ -d page_offline_pre_trigger.extern ] + then + ls page_offline_pre_trigger.extern | +diff --git a/misc/notices/page-ce-offline-post-notice b/misc/notices/page-ce-offline-post-notice +new file mode 100644 +index 0000000..01966af +--- /dev/null ++++ b/misc/notices/page-ce-offline-post-notice +@@ -0,0 +1,16 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon after a page goes offline. ++ ++cd /etc/rasdaemon_notices/ ++ ++[ -x ./page-ce-offline-post-notice.local ] && . ./page-ce-offline-post-notice.local $1 ++ ++if [ -d page-ce-offline-post-notice.extern ] ++then ++ ls page-ce-offline-post-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-post-notice.extern/$item ] && . ./page-ce-offline-post-notice.extern/$item $1 ++ done ++fi +\ No newline at end of file +diff --git a/misc/notices/page-ce-offline-pre-notice b/misc/notices/page-ce-offline-pre-notice +new file mode 100644 +index 0000000..187556c +--- /dev/null ++++ b/misc/notices/page-ce-offline-pre-notice +@@ -0,0 +1,18 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon before a page goes offline. ++ ++cd /etc/rasdaemon_notices/ ++ ++[ -x ./page-ce-offline-pre-notice.local ] && . 
./page-ce-offline-pre-notice.local $1 ++ ++if [ -d page-ce-offline-pre-notice.extern ] ++then ++ ls page-ce-offline-pre-notice.extern | ++ while read item ++ do ++ [ -x ./page-ce-offline-pre-notice.extern/$item ] && . ./page-ce-offline-pre-notice.extern/$item $1 ++ done ++fi ++ ++exit 0 +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 521f148..23be188 100644 +--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -59,6 +59,8 @@ install -D -p -m 0655 misc/%{name}.logrotate %{buildroot}/usr/share/%{name}/%{na + install -D -p -m 0655 misc/%{name}.rsyslog %{buildroot}/usr/share/%{name}/%{name}.rsyslog + install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{name}.rsyslog-ext + install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext ++install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ ++install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -75,6 +77,7 @@ rm INSTALL %{buildroot}/usr/include/*.h + %config(noreplace) /usr/share/%{name}/%{name}.rsyslog + %config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext + %config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext ++%{_sysconfdir}/rasdaemon_notices/* + + %post + if systemctl is-active --quiet syslog-ng.service; then +-- +2.43.5 + diff --git a/1016-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch b/1016-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch deleted file mode 100644 index e08b2d2c4ea2c01d46017ade7555f932d267274c..0000000000000000000000000000000000000000 --- a/1016-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 7a9ec6f75efa7cb1e590c231900720ef6fe32b46 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Thu, 28 Apr 2022 22:59:04 +0100 -Subject: [PATCH 16/85] rasdaemon: Fix for a memory out-of-bounds issue and - optimized code 
to remove duplicate function. - -Fixed a memory out-of-bounds issue with string pointers and -optimized code structure to remove duplicate function. - -Signed-off-by: Lei Feng -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisi_hip08.c | 6 +++--- - non-standard-hisilicon.c | 2 +- - ras-non-standard-handler.c | 16 +--------------- - 3 files changed, 5 insertions(+), 19 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 9092183..4ef47ea 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -1014,15 +1014,15 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, - - static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { - { -- .sec_type = "1f8161e155d641e6bd107afd1dc5f7c5", -+ .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5", - .decode = decode_hip08_oem_type1_error, - }, - { -- .sec_type = "45534ea6ce2341158535e07ab3aef91d", -+ .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d", - .decode = decode_hip08_oem_type2_error, - }, - { -- .sec_type = "b2889fc9e7d74f9da867af42e98be772", -+ .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772", - .decode = decode_hip08_pcie_local_error, - }, - }; -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index d1e1774..6ee9271 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -387,7 +387,7 @@ static int decode_hisi_common_section(struct ras_events *ras, - - static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { - { -- .sec_type = "c8b328a899174af69a132e08ab2e7586", -+ .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586", - .decode = decode_hisi_common_section, - }, - }; -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 6d5a6f8..6932e58 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -52,20 +52,6 @@ static char *uuid_le(const char *uu) - return uuid; - } - --static int uuid_le_cmp(const char *sec_type, const char 
*uuid2) --{ -- static char uuid1[32]; -- char *p = uuid1; -- int i; -- static const unsigned char le[16] = { -- 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; -- -- for (i = 0; i < 16; i++) -- p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]); -- *p = 0; -- return strncmp(uuid1, uuid2, 32); --} -- - int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) - { - struct ras_ns_ev_decoder *list; -@@ -96,7 +82,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p - - ns_ev_decoder = ras_ns_ev_dec_list; - while (ns_ev_decoder) { -- if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) { -+ if (strcmp(uuid_le(sec_type), ns_ev_decoder->sec_type) == 0) { - *p_ns_ev_dec = ns_ev_decoder; - match = 1; - break; --- -2.33.1 - diff --git a/1077-rasdaemon-add-rasdaemon-json-exporter.patch b/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch similarity index 66% rename from 1077-rasdaemon-add-rasdaemon-json-exporter.patch rename to 1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch index 6951800f778bc2b317b91da9ac4083ba33e02d0c..637a05b8de3e0623722d23c6d5129e05b3e06d05 100644 --- a/1077-rasdaemon-add-rasdaemon-json-exporter.patch +++ b/1017-anolis-rasdaemon-add-rasdaemon-json-exporter.patch @@ -1,31 +1,32 @@ -From ece0a63689ee54498bb0d372cfb568e431ada5d5 Mon Sep 17 00:00:00 2001 +From 637a69ee5de5376eb185ea390cd07d8b9e5d4747 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Mon, 9 Dec 2024 16:28:54 +0800 -Subject: [PATCH 77/85] rasdaemon: add rasdaemon json exporter +Subject: [PATCH 17/30] anolis: rasdaemon: add rasdaemon json exporter Signed-off-by: Ruidong Tian --- - Makefile.am | 5 +- + Makefile.am | 3 + configure.ac | 16 +++ misc/rasdaemon.env | 2 + - ras-aer-handler.c | 9 ++ - ras-arm-handler.c | 4 + - ras-mc-handler.c | 9 ++ - ras-mce-handler.c | 5 + + ras-aer-handler.c | 9 +- + ras-arm-handler.c | 6 +- + ras-mc-handler.c | 11 +- + ras-mce-handler.c | 7 +- ras-mce-handler.h | 1 + - 
ras-memory-failure-handler.c | 4 + + ras-memory-failure-handler.c | 6 +- ras-record.h | 9 ++ - ras-report-json.c | 240 +++++++++++++++++++++++++++++++++++ - ras-report.h | 14 ++ - rasdaemon.c | 9 +- - 13 files changed, 325 insertions(+), 2 deletions(-) + ras-report-json.c | 238 +++++++++++++++++++++++++++++++++++ + ras-report.h | 14 +++ + ras-signal-handler.c | 2 +- + rasdaemon.c | 8 ++ + 14 files changed, 326 insertions(+), 6 deletions(-) create mode 100644 ras-report-json.c diff --git a/Makefile.am b/Makefile.am -index fb0248e..2582454 100644 +index 61f9a84..1f21137 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -54,6 +54,9 @@ endif +@@ -104,6 +104,9 @@ endif if WITH_ABRT_REPORT rasdaemon_SOURCES += ras-report.c endif @@ -35,21 +36,12 @@ index fb0248e..2582454 100644 if WITH_HISI_NS_DECODE rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c endif -@@ -70,7 +73,7 @@ if WITH_YITIAN_NS_DECODE - rasdaemon_SOURCES += non-standard-yitian.c - endif - --rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a -+rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a $(PCI_LIBS) - - include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ diff --git a/configure.ac b/configure.ac -index 135af9c..c8b4ab6 100644 +index 43d845d..c5164ec 100644 --- a/configure.ac +++ b/configure.ac -@@ -131,6 +131,21 @@ AS_IF([test "x$enable_abrt_report" = "xyes" || test "x$enable_all" == "xyes"], [ - AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes || test x$enable_all == xyes]) +@@ -170,6 +170,21 @@ AS_IF([test "x$enable_abrt_report" = "xyes" || test "x$enable_all" = "xyes"], [ + AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes || test x$enable_all = xyes]) AM_COND_IF([WITH_ABRT_REPORT], [USE_ABRT_REPORT="yes"], [USE_ABRT_REPORT="no"]) +AC_ARG_ENABLE([json_report], @@ -70,63 +62,69 @@ index 135af9c..c8b4ab6 100644 
AC_ARG_ENABLE([hisi_ns_decode], AS_HELP_STRING([--enable-hisi-ns-decode], [enable HISI_NS_DECODE events (currently experimental)])) -@@ -223,4 +238,5 @@ compile time options summary - AMP RAS errors : $USE_AMP_NS_DECODE - CPU fault isolation : $USE_CPU_FAULT_ISOLATION - YITIAN RAS errors : $USE_YITIAN_NS_DECODE +@@ -337,4 +352,5 @@ compile time options summary + Signal : $USE_SIGNAL + ERST : $USE_ERST + NVGPU RAS errors : $USE_NVGPU + Json exporter : $USE_JSON_REPORT EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 67f488f..760a42d 100644 +index f3f17c2..085d839 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -45,6 +45,8 @@ CPU_ISOLATION_CYCLE="24h" +@@ -73,6 +73,8 @@ CPU_ISOLATION_CYCLE="24h" # Prevent excessive isolation from causing an avalanche effect CPU_ISOLATION_LIMIT="10" ++DISABLE="json_report" + -+DISABLE="" # Event Trigger # Event trigger will be executed when the specified event occurs. diff --git a/ras-aer-handler.c b/ras-aer-handler.c -index b00703e..d0eb4df 100644 +index c67f267..023dd4d 100644 --- a/ras-aer-handler.c +++ b/ras-aer-handler.c -@@ -22,6 +22,8 @@ - #include "libtrace/kbuffer.h" - #include "ras-aer-handler.h" - #include "types.h" -+#include "ras-events.h" -+#include "ras-record.h" - #include "ras-logger.h" - #include "bitfield.h" - #include "ras-report.h" -@@ -135,18 +137,22 @@ int ras_aer_event_handler(struct trace_seq *s, - case HW_EVENT_AER_UNCORRECTED_NON_FATAL: - ev.error_type = "Uncorrected (Non-Fatal)"; +@@ -115,7 +115,7 @@ int ras_aer_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_aer_event ev; ++ struct ras_aer_event ev = { 0 }; + char buf[BUF_LEN] = { 0 }; + uint16_t vendor_id = 0, device_id = 0; + #ifdef HAVE_AMP_NS_DECODE +@@ -207,24 +207,28 @@ int ras_aer_event_handler(struct trace_seq *s, + #ifdef HAVE_AMP_NS_DECODE sel_data[0] = 0xca; + #endif + ev.severity = GHES_SEV_RECOVERABLE; break; case HW_EVENT_AER_UNCORRECTED_FATAL: 
ev.error_type = "Uncorrected (Fatal)"; + #ifdef HAVE_AMP_NS_DECODE sel_data[0] = 0xca; + #endif + ev.severity = GHES_SEV_PANIC; break; case HW_EVENT_AER_CORRECTED: ev.error_type = "Corrected"; + #ifdef HAVE_AMP_NS_DECODE sel_data[0] = 0xbf; + #endif + ev.severity = GHES_SEV_CORRECTED; break; default: ev.error_type = "Unknown severity"; + #ifdef HAVE_AMP_NS_DECODE sel_data[0] = 0xbf; + #endif + ev.severity = GHES_SEV_NO; } trace_seq_puts(s, ev.error_type); -@@ -184,6 +190,9 @@ int ras_aer_event_handler(struct trace_seq *s, - system(ipmi_add_sel); +@@ -271,6 +275,9 @@ int ras_aer_event_handler(struct trace_seq *s, + return -1; #endif +#ifdef HAVE_JSON_REPORT @@ -136,10 +134,19 @@ index b00703e..d0eb4df 100644 return 0; diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 19150cb..97ebe21 100644 +index 226feb3..431dd9b 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c -@@ -264,5 +264,9 @@ int ras_arm_event_handler(struct trace_seq *s, +@@ -484,7 +484,7 @@ int ras_arm_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_arm_event ev; ++ struct ras_arm_event ev = { 0 }; + int len = 0; + + memset(&ev, 0, sizeof(ev)); +@@ -606,5 +606,9 @@ int ras_arm_event_handler(struct trace_seq *s, ras_report_arm_event(ras, &ev); #endif @@ -150,18 +157,27 @@ index 19150cb..97ebe21 100644 return 0; } diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index a270637..bb93c9d 100644 +index e55c199..2ffaf2e 100644 --- a/ras-mc-handler.c +++ b/ras-mc-handler.c -@@ -29,6 +29,7 @@ +@@ -17,6 +17,7 @@ + #include "ras-mc-handler.h" #include "ras-page-isolation.h" - #include "types.h" #include "ras-report.h" +#include "ras-events.h" #include "trigger.h" + #include "types.h" - int ras_mc_event_handler(struct trace_seq *s, -@@ -77,16 +78,20 @@ int ras_mc_event_handler(struct trace_seq *s, +@@ -50,7 +51,7 @@ int ras_mc_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct 
ras_mc_event ev; ++ struct ras_mc_event ev = { 0 }; + int parsed_fields = 0; + const char *level; + +@@ -61,19 +62,23 @@ int ras_mc_event_handler(struct trace_seq *s, switch (val) { case HW_EVENT_ERR_CORRECTED: ev.error_type = "Corrected"; @@ -171,6 +187,9 @@ index a270637..bb93c9d 100644 ev.error_type = "Uncorrected"; + ev.severity = GHES_SEV_RECOVERABLE; break; + case HW_EVENT_ERR_DEFERRED: + ev.error_type = "Deferred"; + break; case HW_EVENT_ERR_FATAL: ev.error_type = "Fatal"; + ev.severity = GHES_SEV_PANIC; @@ -181,8 +200,8 @@ index a270637..bb93c9d 100644 + ev.severity = GHES_SEV_NO; } - trace_seq_puts(s, ev.error_type); -@@ -202,6 +207,10 @@ int ras_mc_event_handler(struct trace_seq *s, + switch (val) { +@@ -249,6 +254,10 @@ int ras_mc_event_handler(struct trace_seq *s, run_mc_event_trigger(&ev); @@ -194,18 +213,27 @@ index a270637..bb93c9d 100644 parse_error: diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index 9601704..ecc6468 100644 +index c272bb0..b61976a 100644 --- a/ras-mce-handler.c +++ b/ras-mce-handler.c -@@ -30,6 +30,7 @@ - #include "ras-logger.h" +@@ -18,6 +18,7 @@ #include "ras-report.h" + #include "types.h" #include "trigger.h" +#include "ras-events.h" /* - * The code below were adapted from Andi Kleen/Intel/SuSe mcelog code, -@@ -581,6 +582,10 @@ int ras_mce_event_handler(struct trace_seq *s, + * The code below were adapted from Andi Kleen/Intel/SUSE mcelog code, +@@ -507,7 +508,7 @@ int ras_mce_event_handler(struct trace_seq *s, + unsigned long long val; + struct ras_events *ras = context; + struct mce_priv *mce = ras->mce_priv; +- struct mce_event e; ++ struct mce_event e = { 0 }; + int rc = 0; + + memset(&e, 0, sizeof(e)); +@@ -608,6 +609,10 @@ int ras_mce_event_handler(struct trace_seq *s, ras_report_mce_event(ras, &e); #endif @@ -217,22 +245,31 @@ index 9601704..ecc6468 100644 return 0; diff --git a/ras-mce-handler.h b/ras-mce-handler.h -index e1064f6..f0dbdab 100644 +index f120874..d2031cf 100644 --- a/ras-mce-handler.h +++ 
b/ras-mce-handler.h -@@ -76,6 +76,7 @@ struct mce_event { - uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ - uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ +@@ -68,6 +68,7 @@ struct mce_event { + int32_t vdata_len; + const uint64_t *vdata; + int severity; /* Parsed data */ + char frutext[17]; char timestamp[64]; - char bank_name[64]; diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index 8bc7a9d..9cd56b4 100644 +index 43e7c5d..df90244 100644 --- a/ras-memory-failure-handler.c +++ b/ras-memory-failure-handler.c -@@ -174,5 +174,9 @@ int ras_memory_failure_event_handler(struct trace_seq *s, +@@ -117,7 +117,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_mf_event ev; ++ struct ras_mf_event ev = { 0 }; + + trace_seq_printf(s, "%s ", loglevel_str[LOGLEVEL_ALERT]); + /* +@@ -172,5 +172,9 @@ int ras_memory_failure_event_handler(struct trace_seq *s, #endif run_mf_event_trigger(&ev); @@ -243,10 +280,10 @@ index 8bc7a9d..9cd56b4 100644 return 0; } diff --git a/ras-record.h b/ras-record.h -index f1edcc0..f48fe37 100644 +index ce7d12c..7f49b74 100644 --- a/ras-record.h +++ b/ras-record.h -@@ -25,6 +25,13 @@ +@@ -16,6 +16,13 @@ #include "config.h" #include "types.h" @@ -260,7 +297,7 @@ index f1edcc0..f48fe37 100644 extern long user_hz; struct ras_events; -@@ -32,6 +39,7 @@ struct ras_events; +@@ -23,6 +30,7 @@ struct ras_events; struct ras_mc_event { char timestamp[64]; int error_count; @@ -268,20 +305,20 @@ index f1edcc0..f48fe37 100644 const char *error_type, *msg, *label; unsigned char mc_index; signed char top_layer, middle_layer, lower_layer; -@@ -51,6 +59,7 @@ struct ras_mc_offline_event { - struct ras_aer_event { +@@ -44,6 +52,7 @@ struct ras_aer_event { char timestamp[64]; const char *error_type; + char *dev_name; + int severity; - const char *dev_name; uint8_t tlp_header_valid; uint32_t *tlp_header; + const char *msg; 
diff --git a/ras-report-json.c b/ras-report-json.c new file mode 100644 -index 0000000..f59ca32 +index 0000000..b1c33a4 --- /dev/null +++ b/ras-report-json.c -@@ -0,0 +1,240 @@ +@@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * @@ -304,7 +341,7 @@ index 0000000..f59ca32 +#include +#include + -+#include "libtrace/event-parse.h" ++#include "traceevent/event-parse.h" +#include "ras-report.h" + +#define NONE "" @@ -316,17 +353,17 @@ index 0000000..f59ca32 + return; + + trace_seq_printf(s, -+ "\n{ \"%s\": \"mc_event\", " \ -+ "\"timestamp\": \"%s\", " \ -+ "\"severity\": \"%s\", " \ -+ "\"error_count\": %d, " \ -+ "\"error_type\": \"%s\", " \ -+ "\"msg\": \"%s\", " \ -+ "\"label\": \"%s\", " \ -+ "\"location\": \"%d:%d:%d:%d\", " \ -+ "\"address\": \"%#llx\", " \ -+ "\"grain\": \"%#llx\", " \ -+ "\"syndrome\": \"%#llx\", " \ ++ "\n{ \"%s\": \"%s\", " ++ "\"timestamp\": \"%s\", " ++ "\"severity\": \"%s\", " ++ "\"error_count\": %d, " ++ "\"error_type\": \"%s\", " ++ "\"msg\": \"%s\", " ++ "\"label\": \"%s\", " ++ "\"location\": \"%d:%d:%d:%d\", " ++ "\"address\": \"%#llx\", " ++ "\"grain\": \"%#llx\", " ++ "\"syndrome\": \"%#llx\", " + "\"driver_detail\": \"%s\" }", + JSON_REPORT_KEY, + (*ev->timestamp) ? 
ev->timestamp : NONE, @@ -358,7 +395,6 @@ index 0000000..f59ca32 + if (!pci_name) + goto free; + -+ + if (sscanf(bdf, "%x:%x.%x", &bus, &device, &function) == 3) + domain = 0; + else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &device) == 3) @@ -386,13 +422,12 @@ index 0000000..f59ca32 + +free: + pci_cleanup(pacc); -+ return; +} + +void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev) +{ + char pci_name[128]; -+ u16 vendor, device; ++ u16 vendor = 0, device = 0; + + if (!s || !ev || !json_report) + return; @@ -449,8 +484,8 @@ index 0000000..f59ca32 + return; + + trace_seq_printf(s, -+ "\n{ \"%s\": \"mf_event\", \"timestamp\": \"%s\", " \ -+ "\"pfn\": %s, \"page_type\": \"%s\", " \ ++ "\n{ \"%s\": \"mf_event\", \"timestamp\": \"%s\", " ++ "\"pfn\": %s, \"page_type\": \"%s\", " + "\"action_result\": \"%s\" }", + JSON_REPORT_KEY, + (*ev->timestamp) ? ev->timestamp : NONE, @@ -472,29 +507,29 @@ index 0000000..f59ca32 + ev->severity = GHES_SEV_CORRECTED; + + trace_seq_printf(s, -+ "\n{ \"%s\": \"mce_record\", " \ -+ "\"timestamp\": \"%s\", " \ -+ "\"severity\": \"%s\", " \ -+ "\"bank\": %d, " \ -+ "\"bank_name\": \"%s\", " \ -+ "\"status\": \"%#lx\", " \ -+ "\"error_msg\": \"%s\", " \ -+ "\"mcistatus_msg\": \"%s\", " \ -+ "\"mcastatus_msg\": \"%s\", " \ -+ "\"user_action\": \"%s\", " \ -+ "\"mc_location\": \"%s\", " \ -+ "\"cpuid\": \"%#x\", " \ -+ "\"cpu\": %d, " \ -+ "\"socketid\": %d, " \ -+ "\"ip\": \"%#lx\", " \ -+ "\"cs\": \"%#x\", " \ -+ "\"misc\": \"%#lx\", " \ -+ "\"addr\": \"%#lx\", " \ -+ "\"synd\": \"%#lx\", " \ -+ "\"ipid\": \"%#lx\", " \ -+ "\"mcgstatus_msg\": \"%s\", " \ -+ "\"mcgstatus\": \"%#lx\", " \ -+ "\"mcgcap\": \"%#lx\", " \ ++ "\n{ \"%s\": \"%s\", " ++ "\"timestamp\": \"%s\", " ++ "\"severity\": \"%s\", " ++ "\"bank\": %d, " ++ "\"bank_name\": \"%s\", " ++ "\"status\": \"%#lx\", " ++ "\"error_msg\": \"%s\", " ++ "\"mcistatus_msg\": \"%s\", " ++ "\"mcastatus_msg\": \"%s\", " ++ "\"user_action\": \"%s\", " ++ "\"mc_location\": 
\"%s\", " ++ "\"cpuid\": \"%#x\", " ++ "\"cpu\": %d, " ++ "\"socketid\": %d, " ++ "\"ip\": \"%#lx\", " ++ "\"cs\": \"%#x\", " ++ "\"misc\": \"%#lx\", " ++ "\"addr\": \"%#lx\", " ++ "\"synd\": \"%#lx\", " ++ "\"ipid\": \"%#lx\", " ++ "\"mcgstatus_msg\": \"%s\", " ++ "\"mcgstatus\": \"%#lx\", " ++ "\"mcgcap\": \"%#lx\", " + "\"apicid\": \"%#x\" }", + JSON_REPORT_KEY, + (*ev->timestamp) ? ev->timestamp : NONE, @@ -523,10 +558,10 @@ index 0000000..f59ca32 +} + diff --git a/ras-report.h b/ras-report.h -index a2edf3c..fb15dc3 100644 +index f680a25..eeb25bb 100644 --- a/ras-report.h +++ b/ras-report.h -@@ -30,6 +30,12 @@ +@@ -23,6 +23,12 @@ /* ABRT socket file */ #define ABRT_SOCKET "/var/run/abrt/abrt.socket" @@ -538,9 +573,9 @@ index a2edf3c..fb15dc3 100644 + #ifdef HAVE_ABRT_REPORT - int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev); -@@ -54,4 +60,12 @@ static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_even - + int ras_report_mc_event(struct ras_events *ras, +@@ -115,4 +121,12 @@ static inline int ras_report_signal_event(struct ras_events *ras, + { return 0; }; #endif +#ifdef HAVE_JSON_REPORT @@ -552,25 +587,34 @@ index a2edf3c..fb15dc3 100644 +#endif + #endif +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +index e8f7f1d..d15c4f6 100644 +--- a/ras-signal-handler.c ++++ b/ras-signal-handler.c +@@ -78,7 +78,7 @@ int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record, + struct ras_events *ras = context; + time_t now; + struct tm *tm; +- struct ras_signal_event ev; ++ struct ras_signal_event ev = { 0 }; + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
diff --git a/rasdaemon.c b/rasdaemon.c -index 88ba1ca..e0e85c1 100644 +index 9c5f9dd..d5d2f85 100644 --- a/rasdaemon.c +++ b/rasdaemon.c -@@ -26,6 +26,7 @@ +@@ -16,6 +16,7 @@ #include "ras-logger.h" - #include "ras-events.h" + #include "ras-poison-page-stat.h" #include "ras-record.h" +#include "ras-report.h" - - /* - * Arguments(argp) handling logic and main -@@ -130,10 +131,16 @@ int main(int argc, char *argv[]) - { - struct arguments args; - int idx = -1; -- choices_disable = getenv(DISABLE); - - choices_disable = getenv(DISABLE); + #include "ras-mc-handler.h" + #include "ras-pcie-edpc.h" + #include "ras-nvgpu.h" +@@ -146,6 +147,13 @@ int main(int argc, char *argv[]) + log(TERM, LOG_INFO, "Threshold of poison page statistics is %lld kB\n", poison_stat_threshold); + #endif +#ifdef HAVE_JSON_REPORT + if (choices_disable && @@ -583,5 +627,5 @@ index 88ba1ca..e0e85c1 100644 const struct argp_option offline_options[] = { {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, -- -2.33.1 +2.43.5 diff --git a/1017-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch b/1017-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch deleted file mode 100644 index 52218b4ae3981f88ae9a837c3b46c374527f2b60..0000000000000000000000000000000000000000 --- a/1017-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch +++ /dev/null @@ -1,35 +0,0 @@ -From eb51a91b6a0ceb22cb93439cb7e0aa013f82ff4f Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 31 Oct 2022 18:36:26 +0800 -Subject: [PATCH 17/85] rasdaemon: Add four modules supported by HiSilicon - common section - -Add four modules supported by HiSilicon common error section. 
- -Signed-off-by: Xiaofei Tan -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisilicon.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 6ee9271..2b00ed6 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -214,7 +214,11 @@ static const char* module_name[] = { - "Tsensor", - "ROH", - "BTC", -- "HILINK" -+ "HILINK", -+ "STARS", -+ "SDMA", -+ "UC", -+ "HBMC", - }; - - static const char* get_soc_desc(uint8_t soc_id) --- -2.33.1 - diff --git a/1078-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch b/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch similarity index 78% rename from 1078-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch rename to 1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch index 2cb1c6e4c840bf81434a3b0bb868ff7947ea56ba..19a3fd5ceedb8a3c7913d248426f048a69ed5370 100644 --- a/1078-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch +++ b/1018-anolis-rasdaemon-kmsg_monitor-introduce-kmsg_monitor.patch @@ -1,68 +1,60 @@ -From 7912ad49ed3a3b172a7c3110cf560d1ea5f87bc4 Mon Sep 17 00:00:00 2001 +From 340a8af496dd80a719e27e6395f96c8d75cf6f36 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Wed, 11 Dec 2024 16:16:30 +0800 -Subject: [PATCH 78/85] rasdaemon: kmsg_monitor: introduce kmsg_monitor +Subject: [PATCH 18/30] anolis: rasdaemon: kmsg_monitor: introduce kmsg_monitor Signed-off-by: Ruidong Tian --- - Makefile.am | 7 +- + Makefile.am | 6 +- configure.ac | 11 +++ - misc/rasdaemon.env | 35 ++++++++ - ras-events.c | 104 +++++++++++++++++++--- - ras-kmsg.c | 209 +++++++++++++++++++++++++++++++++++++++++++++ - ras-kmsg.h | 47 ++++++++++ - ras-report-json.c | 63 ++++++++++++++ + misc/rasdaemon.env | 43 +++++++++- + ras-events.c | 114 +++++++++++++++++++++++-- + ras-kmsg.c | 203 +++++++++++++++++++++++++++++++++++++++++++++ + ras-kmsg.h | 47 +++++++++++ + ras-report-json.c | 68 
++++++++++++++- ras-report.h | 2 + - ras-time.c | 101 ++++++++++++++++++++++ - ras-time.h | 25 ++++++ - rasdaemon.c | 12 +++ - trigger.c | 56 ++++++++++++ + ras-time.c | 103 +++++++++++++++++++++++ + ras-time.h | 27 ++++++ + rasdaemon.c | 14 ++++ + trigger.c | 55 ++++++++++++ trigger.h | 3 + - 13 files changed, 661 insertions(+), 14 deletions(-) + 13 files changed, 685 insertions(+), 11 deletions(-) create mode 100644 ras-kmsg.c create mode 100644 ras-kmsg.h create mode 100644 ras-time.c create mode 100644 ras-time.h diff --git a/Makefile.am b/Makefile.am -index 2582454..024757d 100644 +index 1f21137..68b354b 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -17,7 +17,7 @@ all-local: $(SYSTEMD_SERVICES) - - sbin_PROGRAMS = rasdaemon - rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ -- bitfield.c trigger.c -+ bitfield.c trigger.c ras-time.c - if WITH_SQLITE3 - rasdaemon_SOURCES += ras-record.c - endif -@@ -72,6 +72,9 @@ endif - if WITH_YITIAN_NS_DECODE - rasdaemon_SOURCES += non-standard-yitian.c +@@ -134,6 +134,9 @@ endif + if WITH_SIGNAL + rasdaemon_SOURCES += ras-signal-handler.c endif +if WITH_KMSG_MONITOR -+ rasdaemon_SOURCES += ras-kmsg.c ++ rasdaemon_SOURCES += ras-kmsg.c ras-time.c +endif - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a $(PCI_LIBS) - -@@ -80,7 +83,7 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + if WITH_POISON_PAGE_STAT + rasdaemon_SOURCES += ras-poison-page-stat.c +@@ -159,7 +162,8 @@ include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -- ras-cpu-isolation.h queue.h non-standard-yitian.h trigger.h -+ ras-cpu-isolation.h queue.h non-standard-yitian.h trigger.h ras-kmsg.h ras-time.h + ras-cxl-handler.h ras-cpu-isolation.h 
queue.h non-standard-yitian.h \ + non-standard-jaguarmicro.h trigger.h unified-sel.h ras-signal-handler.h \ +- ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h ++ ras-poison-page-stat.h ras-erst.h ras-pcie-edpc.h ras-nvgpu.h \ ++ ras-kmsg.h ras-time.h # This rule can't be called with more than one Makefile job (like make -j8) # I can't figure out a way to fix that diff --git a/configure.ac b/configure.ac -index c8b4ab6..2136739 100644 +index c5164ec..dfb7f02 100644 --- a/configure.ac +++ b/configure.ac -@@ -196,6 +196,16 @@ AS_IF([test "x$enable_yitian_ns_decode" = "xyes" || test "x$enable_all" == "xyes - AM_CONDITIONAL([WITH_YITIAN_NS_DECODE], [test x$enable_yitian_ns_decode = xyes || test x$enable_all == xyes]) - AM_COND_IF([WITH_YITIAN_NS_DECODE], [USE_YITIAN_NS_DECODE="yes"], [USE_YITIAN_NS_DECODE="no"]) +@@ -303,6 +303,16 @@ AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [ + AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"]) +AC_ARG_ENABLE([kmsg_monitor], + AS_HELP_STRING([--enable-kmsg-monitor], [enable kmsg monitor (currently experimental)])) @@ -77,29 +69,46 @@ index c8b4ab6..2136739 100644 test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -239,4 +249,5 @@ compile time options summary - CPU fault isolation : $USE_CPU_FAULT_ISOLATION - YITIAN RAS errors : $USE_YITIAN_NS_DECODE +@@ -353,4 +363,5 @@ compile time options summary + ERST : $USE_ERST + NVGPU RAS errors : $USE_NVGPU Json exporter : $USE_JSON_REPORT + Kmsg monitor : $USE_KMSG_MONITOR EOF diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 760a42d..2232a29 100644 +index 085d839..f498e24 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -93,3 +93,38 @@ PRE_PAGE_OFFLINE_TRIGGER= - POST_PAGE_OFFLINE_TRIGGER= +@@ -73,7 +73,7 @@ CPU_ISOLATION_CYCLE="24h" + # Prevent excessive 
isolation from causing an avalanche effect + CPU_ISOLATION_LIMIT="10" + +-DISABLE="json_report" ++DISABLE="json_report,kmsg_monitor" + + # Event Trigger + +@@ -115,6 +115,10 @@ POST_PAGE_OFFLINE_TRIGGER= PRE_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 -+ + +#trigger for kmsg +KMSG_TRIGGER= +KMSG_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second. +@@ -145,3 +149,40 @@ EDPC_DEVICE= + # For example: + # NVGPU_DISABLE_EVENT="0x10" # disable nvmlEventTypeClock + NVGPU_DISABLE_EVENT="0x10" ++ + +# KMSG MONITOR +KMSG_IGNORE_XID="" -+KMSG_LIMIT=0 -+KMSG_TRACE_NUM=5 ++KMSG_LIMIT=100 ++KMSG_TRACE_NUM=6 +KMSG_TRACE_END=1 + +KMSG_TRACER_NAME_0="xid" @@ -126,36 +135,51 @@ index 760a42d..2232a29 100644 +KMSG_TRACER_REGEX_4="pcieport (.*): pciehp: Slot\\(([0-9]+)\\): (Link Up|Link Down|Card present|Card not present|Link Down/Up ignored \\(recovered by DPC\\))" +KMSG_TRACER_GROUP_COUNT_4=3 +KMSG_TRACER_GROUP_KEY_4="pci_port,slot,res" ++ ++KMSG_TRACER_NAME_5="cmci_storm" ++KMSG_TRACER_REGEX_5="CMCI storm (.*): switching to .* mode" ++KMSG_TRACER_GROUP_COUNT_5=1 ++KMSG_TRACER_GROUP_KEY_5="storm" diff --git a/ras-events.c b/ras-events.c -index f944847..f5540b3 100644 +index 06f9a37..d40f29e 100644 --- a/ras-events.c +++ b/ras-events.c -@@ -26,6 +26,7 @@ +@@ -14,6 +14,8 @@ #include #include #include ++#include +#include #include #include - #include "libtrace/kbuffer.h" -@@ -47,6 +48,7 @@ - #include "ras-events.h" + #include +@@ -37,6 +39,25 @@ + #include "ras-signal-handler.h" #include "ras-record.h" #include "trigger.h" +#include "ras-kmsg.h" ++ ++#ifdef HAVE_KMSG_MONITOR ++#define NS_PER_SEC 1000000000L ++ ++static struct timespec ts_sub(struct timespec a, struct timespec b) ++{ ++ struct timespec result = { ++ .tv_sec = a.tv_sec - b.tv_sec, ++ .tv_nsec = a.tv_nsec - b.tv_nsec ++ }; ++ ++ if (result.tv_nsec < 0) { ++ result.tv_sec -= 1; ++ result.tv_nsec += NS_PER_SEC; ++ } ++ return result; ++} ++#endif /* * 
Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -454,7 +456,7 @@ static int set_buffer_percent(struct ras_events *ras, int percent) - */ - #define LEGACY_KERNEL 255 - --static int read_ras_event_all_cpus(struct pthread_data *pdata, -+static int read_ras_event_all_cpus(struct pthread_data *pdata, - unsigned int n_cpus) - { - ssize_t size; -@@ -463,15 +465,22 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, +@@ -464,12 +485,22 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, int ready, i, count_nready; struct kbuffer *kbuf; void *page; @@ -165,31 +189,30 @@ index f944847..f5540b3 100644 int warnonce[n_cpus]; char pipe_raw[PATH_MAX]; int legacy_kernel = 0; --#if 0 +#ifdef HAVE_KMSG_MONITOR + int fd_num = n_cpus + 2; + char kmsg_buf[PRINTK_MESSAGE_MAX]; + int limit = 0; -+ clock_t limit_time = clock(); - int need_sleep = 0; ++ struct timespec limit_time = { 0 }; ++ int need_sleep = 0; +#else + int fd_num = n_cpus + 1; - #endif ++#endif + struct pollfd fds[fd_num]; + memset(&warnonce, 0, sizeof(warnonce)); -@@ -498,7 +507,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, +@@ -496,7 +527,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, if (set_buffer_percent(pdata[0].ras, 0)) log(TERM, LOG_WARNING, "Set buffer_percent failed\n"); - for (i = 0; i < (n_cpus + 1); i++) -+ for (i = 0; i < (fd_num); i++) ++ for (i = 0; i < fd_num; i++) fds[i].fd = -1; for (i = 0; i < n_cpus; i++) { -@@ -529,6 +538,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, +@@ -527,6 +558,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, goto error; } @@ -216,7 +239,7 @@ index f944847..f5540b3 100644 log(TERM, LOG_INFO, "Listening to events for cpus 0 to %d\n", n_cpus - 1); if (pdata[0].ras->record_events) { if (ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras)) -@@ -540,7 +569,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, +@@ -538,7 +589,7 @@ 
static int read_ras_event_all_cpus(struct pthread_data *pdata, } do { @@ -225,7 +248,7 @@ index f944847..f5540b3 100644 if (ready < 0) log(TERM, LOG_WARNING, "poll\n"); -@@ -566,6 +595,44 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, +@@ -564,6 +615,40 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, } count_nready = 0; @@ -234,9 +257,9 @@ index f944847..f5540b3 100644 + if (kmsg_monitor && (fds[n_cpus + 1].revents & POLLIN)) { + size = read(fds[n_cpus + 1].fd, kmsg_buf, PRINTK_MESSAGE_MAX); + if (size < 0) { -+ log(TERM, LOG_WARNING, "read kmsg\n"); -+ goto error; ++ log(TERM, LOG_WARNING, "read kmsg %s\n", strerror(errno)); + } else if (size > 0) { ++ kmsg_buf[size] = '\0'; + kmsg_match(kmsg_buf); + memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX); + } else { @@ -244,24 +267,20 @@ index f944847..f5540b3 100644 + } + limit++; + if (kmsg_limit && limit >= kmsg_limit) { -+ clock_t now = clock(); -+ -+ if ((double)(now - limit_time) / CLOCKS_PER_SEC <= 0.5) { -+ need_sleep = 1; -+ log(ALL, LOG_WARNING, "kmsg limit!\n"); ++ struct timespec tv, res; + ++ clock_gettime(CLOCK_MONOTONIC, &tv); + -+ if (lseek(fds[n_cpus + 1].fd, 0, SEEK_END) == -1) { -+ log(TERM, LOG_ERR, "Can not seek kmsg end\n"); -+ goto error; -+ } ++ res = ts_sub(tv, limit_time); ++ if (res.tv_sec == 0 && res.tv_nsec >= 0 && res.tv_nsec < (0.5 * NS_PER_SEC)) { ++ need_sleep = 1; ++ log(TERM, LOG_WARNING, "kmsg limit %lx!\n", res.tv_nsec); + } + + limit = 0; -+ limit_time = now; ++ limit_time = tv; + } + -+ + } else { + count_nready++; + } @@ -270,18 +289,16 @@ index f944847..f5540b3 100644 for (i = 0; i < n_cpus; i++) { if (fds[i].revents & POLLERR) { if (!warnonce[i]) { -@@ -604,20 +671,20 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, +@@ -599,11 +684,18 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, count_nready++; } } --#if 0 -- if (need_sleep) -- sleep(POLLING_TIME); --#else ++#ifdef HAVE_KMSG_MONITOR + if (need_sleep) { + 
usleep(500000); + need_sleep = 0; + } ++#endif + /* * If we enable fallback mode, it will always be used, as @@ -292,12 +309,7 @@ index f944847..f5540b3 100644 /* Should only happen with legacy kernels */ legacy_kernel = 1; break; - } --#endif - } while (1); - - /* poll() is not supported. We need to fallback to the old way */ -@@ -637,7 +704,7 @@ error: +@@ -627,7 +719,7 @@ error: free(page); sigprocmask(SIG_UNBLOCK, &mask, NULL); @@ -306,7 +318,7 @@ index f944847..f5540b3 100644 if (fds[i].fd > 0) close(fds[i].fd); } -@@ -968,6 +1035,13 @@ int handle_ras_events(int record_events) +@@ -991,6 +1083,13 @@ int handle_ras_events(int record_events, int enable_ipmitool) ras_page_account_init(); #endif @@ -320,11 +332,10 @@ index f944847..f5540b3 100644 rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event", ras_mc_event_handler, NULL, MC_EVENT); if (!rc) -@@ -1148,5 +1222,11 @@ err: - #ifdef HAVE_CPU_FAULT_ISOLATION - cpu_infos_free(); +@@ -1269,5 +1368,10 @@ err: + #ifdef HAVE_MEMORY_ROW_CE_PFA + row_record_infos_free(); #endif -+ +#ifdef HAVE_KMSG_MONITOR + if (kmsg_monitor) + kmsg_tracer_destroy(); @@ -334,28 +345,23 @@ index f944847..f5540b3 100644 } diff --git a/ras-kmsg.c b/ras-kmsg.c new file mode 100644 -index 0000000..0230180 +index 0000000..2dd47d6 --- /dev/null +++ b/ras-kmsg.c -@@ -0,0 +1,209 @@ +@@ -0,0 +1,203 @@ +#define _GNU_SOURCE +#include -+#include +#include +#include -+#include -+#include -+#include -+#include ++ +#include "ras-logger.h" +#include "ras-report.h" -+ +#include "ras-kmsg.h" +#include "trigger.h" + +int kmsg_monitor = 1; +int kmsg_trace_end; -+int kmsg_limit = 0; ++int kmsg_limit; + +struct kmsg_tracer_info *kmsg_tracer; +int kmsg_tracer_num; @@ -536,7 +542,6 @@ index 0000000..0230180 + xid_token = strtok(NULL, ","); + } + kmsg_tracer[i].info.xid.len = c; -+ + } + + free(ignore); @@ -601,35 +606,36 @@ index 0000000..f31125f + +#endif diff --git a/ras-report-json.c b/ras-report-json.c -index f59ca32..b175723 100644 +index 
b1c33a4..2d35355 100644 --- a/ras-report-json.c +++ b/ras-report-json.c -@@ -11,8 +11,11 @@ +@@ -11,17 +11,17 @@ * GNU General Public License for more details. */ -+#include +#include #include #include +#include #include - #include - #include -@@ -21,7 +24,9 @@ +-#include +-#include +-#include +-#include #include - #include "libtrace/event-parse.h" + #include "traceevent/event-parse.h" +#include "ras-kmsg.h" #include "ras-report.h" +#include "ras-time.h" #define NONE "" int json_report = 1; -@@ -238,3 +243,61 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) +@@ -236,3 +236,63 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) ev->apicid); } ++#ifdef HAVE_KMSG_MONITOR +void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg) +{ + struct trace_seq seq; @@ -688,19 +694,20 @@ index f59ca32..b175723 100644 + fflush(stdout); + trace_seq_destroy(&seq); +} ++#endif diff --git a/ras-report.h b/ras-report.h -index fb15dc3..de9bd7a 100644 +index eeb25bb..0564992 100644 --- a/ras-report.h +++ b/ras-report.h -@@ -20,6 +20,7 @@ +@@ -13,6 +13,7 @@ #include "ras-mc-handler.h" - #include "ras-mce-handler.h" - #include "ras-aer-handler.h" + #include "ras-record.h" + #include "types.h" +#include "ras-kmsg.h" /* Maximal length of backtrace. 
*/ #define MAX_BACKTRACE_SIZE (1024 * 1024) -@@ -66,6 +67,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev); +@@ -127,6 +128,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev); void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev); void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev); void report_mce_event_json(struct trace_seq *s, struct mce_event *ev); @@ -710,10 +717,12 @@ index fb15dc3..de9bd7a 100644 #endif diff --git a/ras-time.c b/ras-time.c new file mode 100644 -index 0000000..12da0aa +index 0000000..320f1a1 --- /dev/null +++ b/ras-time.c -@@ -0,0 +1,101 @@ +@@ -0,0 +1,103 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ +#define _GNU_SOURCE +#include +#include @@ -799,7 +808,7 @@ index 0000000..12da0aa + end = msg + strlen(msg) - 1; + + p = skip_item(p, end, ","); -+ p =skip_item(p, end, ",;"); ++ p = skip_item(p, end, ",;"); + + errno = 0; + usec = strtoumax(p, &nu, 10); @@ -815,13 +824,14 @@ index 0000000..12da0aa + + strftime(timestamp, 64, "%Y-%m-%d %H:%M:%S %z", tm); +} -\ No newline at end of file diff --git a/ras-time.h b/ras-time.h new file mode 100644 -index 0000000..4277390 +index 0000000..5dabae8 --- /dev/null +++ b/ras-time.h -@@ -0,0 +1,25 @@ +@@ -0,0 +1,27 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ +#ifndef RAS_TIME_H +#define RAS_TIME_H + @@ -836,37 +846,31 @@ index 0000000..4277390 +extern struct timeval boot_time; +extern time_t suspended_time; + -+extern int get_boot_time(struct timeval *boot_time); ++int get_boot_time(struct timeval *boot_time); + -+extern time_t get_suspended_time(void); ++time_t get_suspended_time(void); + -+extern int gettime_monotonic(struct timeval *tv); ++int gettime_monotonic(struct timeval *tv); + -+extern const char *skip_item(const char *begin, const char *end, const char *sep); ++const char *skip_item(const char *begin, const char *end, const char *sep); + -+extern void get_kmsg_time(const char *msg, char *timestamp); 
++void get_kmsg_time(const char *msg, char *timestamp); + +#endif /* RAS_TIME_H */ -\ No newline at end of file diff --git a/rasdaemon.c b/rasdaemon.c -index e0e85c1..02e219a 100644 +index d5d2f85..30dcaf4 100644 --- a/rasdaemon.c +++ b/rasdaemon.c -@@ -22,11 +22,13 @@ - #include - #include - -+#include "ras-time.h" - #include "types.h" - #include "ras-logger.h" +@@ -14,6 +14,8 @@ + #include "ras-erst.h" #include "ras-events.h" + #include "ras-logger.h" ++#include "ras-kmsg.h" ++#include "ras-time.h" + #include "ras-poison-page-stat.h" #include "ras-record.h" #include "ras-report.h" -+#include "ras-kmsg.h" - - /* - * Arguments(argp) handling logic and main -@@ -141,6 +143,13 @@ int main(int argc, char *argv[]) +@@ -154,6 +156,13 @@ int main(int argc, char *argv[]) json_report = 0; #endif @@ -880,18 +884,20 @@ index e0e85c1..02e219a 100644 #ifdef HAVE_MCE const struct argp_option offline_options[] = { {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, -@@ -221,6 +230,9 @@ int main(int argc, char *argv[]) - if (daemon(0, 0)) - exit(EXIT_FAILURE); - +@@ -271,6 +280,11 @@ int main(int argc, char *argv[]) + log(ALL, LOG_INFO, "Create pthread to handle NVGPU events.\n"); + } + #endif ++#ifdef HAVE_KMSG_MONITOR + get_boot_time(&boot_time); + suspended_time = get_suspended_time(); ++#endif + - handle_ras_events(args.record_events); + handle_ras_events(args.record_events, args.enable_ipmitool); - return 0; + #ifdef HAVE_NVGPU diff --git a/trigger.c b/trigger.c -index 00f0cbc..391c817 100644 +index 7387113..d410137 100644 --- a/trigger.c +++ b/trigger.c @@ -99,6 +99,8 @@ struct event_trigger aer_fatal_trigger = {"aer_event", "AER_FATAL_TRIGGER"}; @@ -901,9 +907,9 @@ index 00f0cbc..391c817 100644 +struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"}; + static struct event_trigger *event_triggers[] = { - &mc_ce_trigger, &mc_ue_trigger, -@@ -119,6 +121,9 @@ static struct event_trigger *event_triggers[] = { + #ifdef HAVE_MCE +@@ -117,6 +119,9 @@ static struct 
event_trigger *event_triggers[] = { &pre_page_offline_trigger, &post_page_offline_trigger, #endif @@ -913,7 +919,7 @@ index 00f0cbc..391c817 100644 }; void setup_event_trigger(const char *event) -@@ -427,3 +432,54 @@ void run_page_offline_trigger(unsigned long long addr, int otype, int type) +@@ -421,3 +426,53 @@ void run_page_offline_trigger(unsigned long long addr, int otype, int type) __run_page_offline_trigger(addr, otype, &pre_page_offline_trigger); } @@ -943,7 +949,6 @@ index 00f0cbc..391c817 100644 + key, (int)(e - s), msg + s) < 0) + goto free; + -+ + if (!strcmp("xid", key) || + !strcmp("sxid", key) || + !strcmp("axid", key)) { @@ -969,10 +974,10 @@ index 00f0cbc..391c817 100644 +} + diff --git a/trigger.h b/trigger.h -index d1b50c3..3354b10 100644 +index 74df3d3..b5a6c2c 100644 --- a/trigger.h +++ b/trigger.h -@@ -2,6 +2,7 @@ +@@ -4,6 +4,7 @@ #define __TRIGGER_H__ #include "ras-record.h" @@ -980,7 +985,7 @@ index d1b50c3..3354b10 100644 enum page_offline_trigger_type { PRE, -@@ -25,5 +26,7 @@ void run_mce_record_trigger(struct mce_event *e); +@@ -27,5 +28,7 @@ void run_mce_record_trigger(struct mce_event *e); void run_mf_event_trigger(struct ras_mf_event *e); void run_aer_event_trigger(struct ras_aer_event *e); void run_page_offline_trigger(unsigned long long addr, int otype, int type); @@ -989,5 +994,5 @@ index d1b50c3..3354b10 100644 #endif -- -2.33.1 +2.43.5 diff --git a/1018-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch b/1018-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch deleted file mode 100644 index f8ccc49b77b3a090b114c9ee97fbb1546c7540da..0000000000000000000000000000000000000000 --- a/1018-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 5a8116e4d54de308fdab7734eebefa71efaf7a59 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sat, 4 Feb 2023 19:15:55 +0000 -Subject: [PATCH 18/85] rasdaemon: Fix poll() on per_cpu trace_pipe_raw blocks - indefinitely - -The error events are 
not received in the rasdaemon since kernel 6.1-rc6. -This issue is firstly detected and reported, when testing the CXL error -events in the rasdaemon. - -Debugging showed, poll() on trace_pipe_raw in the ras-events.c do not -return and this issue is seen after the commit -42fb0a1e84ff525ebe560e2baf9451ab69127e2b ("tracing/ring-buffer: Have -polling block on watermark"). - -This issue is also verified using a test application for poll() -and select() on per_cpu trace_pipe_raw. - -There is also a bug reported on this issue, -https://lore.kernel.org/all/31eb3b12-3350-90a4-a0d9-d1494db7cf74@oracle.com/ - -This issue occurs for the per_cpu case, which calls the ring_buffer_poll_wait(), -in kernel/trace/ring_buffer.c, with the buffer_percent > 0 and then wait until -the percentage of pages are available. The default value set for the -buffer_percent is 50 in the kernel/trace/trace.c. However poll() does not return -even met the percentage of pages condition. - -As a fix, rasdaemon set buffer_percent as 0 through the -/sys/kernel/debug/tracing/instances/rasdaemon/buffer_percent, then the -task will wake up as soon as data is added to any of the specific cpu -buffer and poll() on per_cpu/cpuX/trace_pipe_raw does not block -indefinitely. 
- -Dependency on the kernel fix commit -3e46d910d8acf94e5360126593b68bf4fee4c4a1("tracing: Fix poll() and select() -do not work on per_cpu trace_pipe and trace_pipe_raw") - -Signed-off-by: Shiju Jose ---- - ras-events.c | 22 ++++++++++++++++++++++ - 1 file changed, 22 insertions(+) - -diff --git a/ras-events.c b/ras-events.c -index 2a7d709..f9922d4 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -366,6 +366,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - int warnonce[n_cpus]; - char pipe_raw[PATH_MAX]; - int legacy_kernel = 0; -+ int fd; -+ char buf[16]; - #if 0 - int need_sleep = 0; - #endif -@@ -385,6 +387,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - return -ENOMEM; - } - -+ /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks -+ * indefinitely with the default buffer_percent in the kernel trace system, -+ * which is introduced by the following change in the kernel. -+ * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. -+ * Set buffer_percent to 0 so that poll() will return immediately -+ * when the trace data is available in the ras per_cpu trace pipe_raw -+ */ -+ fd = open_trace(pdata[0].ras, "buffer_percent", O_WRONLY); -+ if (fd >= 0) { -+ /* For the backward compatibility to the old kernels, do not return -+ * if fail to set the buffer_percent. 
-+ */ -+ snprintf(buf, sizeof(buf), "0"); -+ size = write(fd, buf, strlen(buf)); -+ if (size <= 0) -+ log(TERM, LOG_WARNING, "can't write to buffer_percent\n"); -+ close(fd); -+ } else -+ log(TERM, LOG_WARNING, "Can't open buffer_percent\n"); -+ - for (i = 0; i < (n_cpus + 1); i++) - fds[i].fd = -1; - --- -2.33.1 - diff --git a/1019-rasdaemon-Move-definition-for-BIT-and-BIT_ULL-to-a-c.patch b/1019-rasdaemon-Move-definition-for-BIT-and-BIT_ULL-to-a-c.patch deleted file mode 100644 index dab5a4eab32babe75b7e8c0c96c059fca1b7152b..0000000000000000000000000000000000000000 --- a/1019-rasdaemon-Move-definition-for-BIT-and-BIT_ULL-to-a-c.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 4d0017a4c4e45983b1090884160c7053398879bd Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Mon, 16 Jan 2023 17:13:32 +0000 -Subject: [PATCH 19/85] rasdaemon: Move definition for BIT and BIT_ULL to a - common file - -Move definition for BIT() and BIT_ULL() to the -common file ras-record.h - -Signed-off-by: Shiju Jose -Reviewed-by: Jonathan Cameron -Reviewed-by: Dave Jiang -Signed-off-by: Mauro Carvalho Chehab ---- - ras-non-standard-handler.h | 3 --- - ras-record.h | 3 +++ - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h -index 57d4cb5..393b756 100644 ---- a/ras-non-standard-handler.h -+++ b/ras-non-standard-handler.h -@@ -17,9 +17,6 @@ - #include "ras-events.h" - #include "libtrace/event-parse.h" - --#define BIT(nr) (1UL << (nr)) --#define BIT_ULL(nr) (1ULL << (nr)) -- - struct ras_ns_ev_decoder { - struct ras_ns_ev_decoder *next; - const char *sec_type; -diff --git a/ras-record.h b/ras-record.h -index 1d1046f..cc49ae2 100644 ---- a/ras-record.h -+++ b/ras-record.h -@@ -26,6 +26,9 @@ - - #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) - -+#define BIT(nr) (1UL << (nr)) -+#define BIT_ULL(nr) (1ULL << (nr)) -+ - extern long user_hz; - - struct ras_events; --- -2.33.1 - diff --git 
a/1079-rasdaemon-erst-add-erst-mce-erst-dmesg.patch b/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch similarity index 65% rename from 1079-rasdaemon-erst-add-erst-mce-erst-dmesg.patch rename to 1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch index 7a0687ac58920b72e667496ad4b698824512045d..1919377ade51d0e1a4a70b8e0c728124e88e4e11 100644 --- a/1079-rasdaemon-erst-add-erst-mce-erst-dmesg.patch +++ b/1019-rasdaemon-erst-add-erst-mce-erst-dmesg.patch @@ -1,154 +1,94 @@ -From 2a8be760ca0502748f9ee1922942328886eaa745 Mon Sep 17 00:00:00 2001 +From 29c769fa59e73a016aea891476caea98fbf3a27d Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 12 Dec 2024 09:37:06 +0800 -Subject: [PATCH 79/85] rasdaemon: erst: add erst-mce erst-dmesg +Subject: [PATCH 19/30] rasdaemon: erst: add erst-mce erst-dmesg Signed-off-by: Ruidong Tian --- - Makefile.am | 8 +- - configure.ac | 15 + - misc/rasdaemon.env | 2 + - ras-erst.c | 1082 ++++++++++++++++++++++++++++++++++++++++++++ - ras-erst.h | 11 + - ras-mce-handler.c | 2 +- - ras-mce-handler.h | 5 + - ras-record.h | 5 + - ras-report-json.c | 15 +- - rasdaemon.c | 25 + - 10 files changed, 1162 insertions(+), 8 deletions(-) - create mode 100644 ras-erst.c - create mode 100644 ras-erst.h + Makefile.am | 4 +- + configure.ac | 4 + + ras-erst-dmesg.c | 875 +++++++++++++++++++++++++++++++++++++++++++ + ras-erst.c | 18 +- + ras-erst.h | 7 + + ras-record.h | 1 + + ras-report-json.c | 29 +- + ras-report.h | 1 + + ras-signal-handler.c | 3 + + rasdaemon.c | 2 - + 10 files changed, 932 insertions(+), 12 deletions(-) + create mode 100644 ras-erst-dmesg.c diff --git a/Makefile.am b/Makefile.am -index 024757d..5e87894 100644 +index 68b354b..da6ef46 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -75,15 +75,19 @@ endif - if WITH_KMSG_MONITOR - rasdaemon_SOURCES += ras-kmsg.c +@@ -142,7 +142,7 @@ if WITH_POISON_PAGE_STAT + rasdaemon_SOURCES += ras-poison-page-stat.c + endif + if WITH_ERST +- rasdaemon_SOURCES += ras-erst.c ++ rasdaemon_SOURCES += 
ras-erst.c ras-erst-dmesg.c + endif + + if WITH_NVGPU +@@ -152,7 +152,7 @@ ras-nvgpu-nvml.h: contrib/nvml.py + rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c endif -+if WITH_ERST -+ rasdaemon_SOURCES += ras-erst.c -+endif --rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a $(PCI_LIBS) -+rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a $(PCI_LIBS) $(ZLIBS) +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ - non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -- ras-cpu-isolation.h queue.h non-standard-yitian.h trigger.h ras-kmsg.h ras-time.h -+ ras-cpu-isolation.h queue.h non-standard-yitian.h trigger.h ras-kmsg.h \ -+ ras-erst.h ras-time.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that diff --git a/configure.ac b/configure.ac -index 2136739..bf44582 100644 +index dfb7f02..68fcb75 100644 --- a/configure.ac +++ b/configure.ac -@@ -206,6 +206,20 @@ AS_IF([test "x$enable_kmsg_monitor" = "xyes" || test "x$enable_all" == "xyes"], - AM_CONDITIONAL([WITH_KMSG_MONITOR], [test x$enable_kmsg_monitor = xyes || test x$enable_all == xyes]) - AM_COND_IF([WITH_KMSG_MONITOR], [USE_KMSG_MONITOR="yes"], [USE_KMSG_MONITOR="no"]) +@@ -287,12 +287,16 @@ AC_ARG_ENABLE([erst], + AS_HELP_STRING([--enable-erst], [enable erst (currently experimental)])) -+AC_ARG_ENABLE([erst], -+ AS_HELP_STRING([--enable-erst], [enable erst (currently 
experimental)])) -+ -+AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [ + AS_IF([test "x$enable_erst" = "xyes" || test "x$enable_all" == "xyes"], [ + AC_CHECK_LIB(z, inflate,[echo "found zlib"] , AC_MSG_ERROR([*** Unable to find zlib library]), ) + ZLIBS="-lz" -+ AC_DEFINE(HAVE_ERST,1,"have ERST") -+ AC_SUBST([WITH_ERST]) -+]) -+AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes]) -+AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) -+ + AC_DEFINE(HAVE_ERST,1,"have ERST") + AC_SUBST([WITH_ERST]) + ]) + AM_CONDITIONAL([WITH_ERST], [test x$enable_erst = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_ERST], [USE_ERST="yes"], [USE_ERST="no"]) + +AC_SUBST([ZLIBS]) + - test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + AC_ARG_ENABLE([nvgpu], + AS_HELP_STRING([--enable-nvgpu], [enable NVGPU events])) - CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -250,4 +264,5 @@ compile time options summary - YITIAN RAS errors : $USE_YITIAN_NS_DECODE - Json exporter : $USE_JSON_REPORT - Kmsg monitor : $USE_KMSG_MONITOR -+ ERST : $USE_ERST - EOF -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 2232a29..efb109e 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -98,6 +98,8 @@ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 - KMSG_TRIGGER= - KMSG_TRIGGER_TIMEOUT=0 - -+ERST_DELETE=1 -+ - # KMSG MONITOR - KMSG_IGNORE_XID="" - KMSG_LIMIT=0 -diff --git a/ras-erst.c b/ras-erst.c +diff --git a/ras-erst-dmesg.c b/ras-erst-dmesg.c new file mode 100644 -index 0000000..ebf6ae4 +index 0000000..ce61a6a --- /dev/null -+++ b/ras-erst.c -@@ -0,0 +1,1082 @@ -+#include -+#include -+#include -+#include -+#include ++++ b/ras-erst-dmesg.c +@@ -0,0 +1,875 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++/* ++* Copyright (C) 2025 Alibaba Inc ++*/ ++ +#include -+#include -+#include ++#include +#include -+#include +#include -+#include -+#include +#include -+#include -+#include 
"libtrace/event-parse.h" ++#include ++ ++#include "bitfield.h" +#include "ras-events.h" ++#include "ras-erst.h" +#include "ras-logger.h" +#include "ras-mce-handler.h" -+#include "ras-aer-handler.h" -+#include "bitfield.h" ++#include "ras-record.h" +#include "ras-report.h" +#include "types.h" + -+#include "ras-erst.h" -+ -+struct mce { -+ uint64_t status; /* Bank's MCi_STATUS MSR */ -+ uint64_t misc; /* Bank's MCi_MISC MSR */ -+ uint64_t addr; /* Bank's MCi_ADDR MSR */ -+ uint64_t mcgstatus; /* Machine Check Global Status MSR */ -+ uint64_t ip; /* Instruction Pointer when the error happened */ -+ uint64_t tsc; /* CPU time stamp counter */ -+ uint64_t time; /* Wall time_t when error was detected */ -+ uint8_t cpuvendor; /* Kernel's X86_VENDOR enum */ -+ uint8_t inject_flags; /* Software inject flags */ -+ uint8_t severity; /* Error severity */ -+ uint8_t pad; -+ uint32_t cpuid; /* CPUID 1 EAX */ -+ uint8_t cs; /* Code segment */ -+ uint8_t bank; /* Machine check bank reporting the error */ -+ uint8_t cpu; /* CPU number; obsoleted by extcpu */ -+ uint8_t finished; /* Entry is valid */ -+ uint32_t extcpu; /* Linux CPU number that detected the error */ -+ uint32_t socketid; /* CPU socket ID */ -+ uint32_t apicid; /* CPU initial APIC ID */ -+ uint64_t mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ -+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ -+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ -+ uint64_t ppin; /* Protected Processor Inventory Number */ -+ uint32_t microcode; /* Microcode revision */ -+ uint64_t kflags; /* Internal kernel use */ -+}; -+ +struct apei_regex { + regex_t hdr; + regex_t severity; @@ -216,16 +156,7 @@ index 0000000..ebf6ae4 + }; +}; + -+int erst_mce_enable = 1; -+int erst_panic_enable = 1; -+int erst_delete = 0; -+time_t last_reboot_time = 0; -+ -+#define ERST_PATH "/sys/fs/pstore/" -+#define MCE_ERST_PREFIX "mce-erst" -+#define ERST_EVENT_NAME "mce_erst_record" -+#define ERST_PANIC_NAME 
"dmesg_erst_record" -+#define LAST_REBOOT_INDEX 2 ++time_t last_reboot_time; + +static void get_last_reboot_time(void) +{ @@ -273,170 +204,6 @@ index 0000000..ebf6ae4 + return; +} + -+#ifdef HAVE_MCE -+static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e) -+{ -+ struct mce_priv *mce = ras->mce_priv; -+ struct trace_seq s; -+ int rc = 0, len; -+ static char *spaces = " "; /* 20 spaces */ -+ -+ switch (mce->cputype) { -+ case CPU_GENERIC: -+ break; -+ case CPU_K8: -+ rc = parse_amd_k8_event(ras, e); -+ break; -+ case CPU_AMD_SMCA: -+ case CPU_DHYANA: -+ rc = parse_amd_smca_event(ras, e); -+ break; -+ default: /* All other CPU types are Intel */ -+ rc = parse_intel_event(ras, e); -+ } -+ -+ if (rc) -+ return; -+ -+ if (!e->error_msg && e->mcastatus_msg) -+ mce_snprintf(e->error_msg, "%s", e->mcastatus_msg); -+ -+ trace_seq_init(&s); -+ -+ trace_seq_printf(&s, " %s: ", ERST_EVENT_NAME); -+ -+ len = strlen(ERST_EVENT_NAME); -+ if (len < 20) -+ trace_seq_printf(&s, "%.*s", 20 - len, spaces); -+ -+ report_mce_event(ras, NULL, &s, e); -+ -+#ifdef HAVE_JSON_REPORT -+ strftime(e->timestamp, sizeof(e->timestamp), -+ "%Y-%m-%d %H:%M:%S %z", localtime((time_t *)&e->walltime)); -+ -+ report_mce_event_json(&s, e); -+#endif -+ -+ trace_seq_terminate(&s); -+ -+ trace_seq_do_printf(&s); -+ printf("\n"); -+ fflush(stdout); -+ trace_seq_destroy(&s); -+} -+ -+static void handle_erst_mce_file(const char *dir_name, const char *d_name, struct ras_events *ras) -+{ -+ char file_path[512]; -+ FILE *file; -+ struct mce mce; -+ struct mce_event e = { 0 }; -+ int rc; -+ -+ if (strncmp(d_name, MCE_ERST_PREFIX, strlen(MCE_ERST_PREFIX))) -+ return; -+ -+ snprintf(file_path, sizeof(file_path), "%s/%s", dir_name, d_name); -+ -+ file = fopen(file_path, "r"); -+ if (!file) { -+ log(ALL, LOG_INFO, "Failed to open file %s\n", file_path); -+ return; -+ } -+ -+ rc = fread((char *)&mce, 1, sizeof(mce), file); -+ if (rc < sizeof(mce)) { -+ log(ALL, LOG_ERR, "Failed to read file"); -+ 
fclose(file); -+ } -+ -+ e.mcgcap = mce.mcgcap; -+ e.mcgstatus = mce.mcgstatus; -+ -+ e.status = mce.status; -+ e.addr = mce.addr; -+ e.misc = mce.misc; -+ e.synd = mce.synd; -+ e.ipid = mce.ipid; -+ e.ip = mce.ip; -+ e.tsc = mce.tsc; -+ e.walltime = mce.time; -+ e.cpu = mce.extcpu; -+ e.cpuid = mce.cpuid; -+ e.apicid = mce.apicid; -+ e.socketid = mce.socketid; -+ e.cs = mce.cs; -+ e.bank = mce.bank; -+ e.cpuvendor = mce.cpuvendor; -+ e.erst = 1; -+ -+ ras_erst_mce_handler(ras, &e); -+ -+ fclose(file); -+ -+ if (erst_delete && unlink(file_path)) { -+ log(ALL, LOG_INFO, "Error deleting file %s\n", file_path); -+ return; -+ } -+} -+ -+void handle_erst_mce(void) -+{ -+ struct ras_events ras = { 0 }; -+ int rc; -+ -+ if (!last_reboot_time) -+ get_last_reboot_time(); -+ -+ rc = register_mce_handler(&ras, sysconf(_SC_NPROCESSORS_CONF)); -+ if (rc) { -+ log(ALL, LOG_INFO, "Can't register mce handler\n"); -+ return; -+ } -+ -+ if (!ras.mce_priv) { -+ log(ALL, LOG_INFO, "Register mce handler failed\n"); -+ return; -+ } -+ -+ DIR *dir = opendir(ERST_PATH); -+ -+ if (!dir) { -+ log(ALL, LOG_INFO, "Failed to open directory"); -+ return; -+ } -+ -+ struct dirent *entry; -+ -+ while ((entry = readdir(dir)) != NULL) { -+ struct stat path_stat; -+ char file_path[MAX_PATH]; -+ -+ snprintf(file_path, sizeof(file_path), "%s/%s", ERST_PATH, entry->d_name); -+ stat(file_path, &path_stat); -+ -+ if (S_ISDIR(path_stat.st_mode) && !strncmp("erst", entry->d_name, sizeof("erst"))) { -+ DIR *subdir = opendir(file_path); -+ struct dirent *subentry; -+ -+ if (!subdir) { -+ log(ALL, LOG_INFO, "Failed to open directory %s\n", strerror(errno)); -+ break; -+ } -+ while ((subentry = readdir(subdir)) != NULL) -+ handle_erst_mce_file(file_path, subentry->d_name, &ras); -+ -+ closedir(subdir); -+ -+ } else -+ handle_erst_mce_file(ERST_PATH, entry->d_name, &ras); -+ } -+ -+ closedir(dir); -+} -+#endif -+ +#define DMESG_ERST_PREFIX "dmesg-erst" +#define DMESG_ERST_SUFFIX "enc.z" + @@ -465,10 +232,8 @@ 
index 0000000..ebf6ae4 +#define APEI_ARM_MIDR ".*\\{([0-9]+)\\}\\[Hardware Error\\]: MIDR: (.*)" +#define APEI_ARM_MPIDR ".*\\{([0-9]+)\\}\\[Hardware Error\\]: Multiprocessor Affinity Register \\(MPIDR\\): (.*)" + -+#define ERST_DELETE_FILE "ERST_DELETE_FILE" -+ +static int decompress_deflate(const char *compressed_data, ssize_t compressed_data_size, -+ char *decompressed_data, ssize_t *decompressed_data_size, z_stream *zstream) ++ char *decompressed_data, ssize_t *decompressed_data_size, z_stream *zstream) +{ + int ret = Z_OK; + @@ -607,7 +372,7 @@ index 0000000..ebf6ae4 + ev.tlp_header_valid = (apei->pcie.tlp_hdr != NULL); + if (ev.tlp_header_valid) + snprintf((buf + strlen(ev.msg)), 1024 - strlen(ev.msg), -+ " TLP Header: %s", apei->pcie.tlp_hdr); ++ " TLP Header: %s", apei->pcie.tlp_hdr); + + ev.severity = apei->sev; + switch (apei->sev) { @@ -1177,109 +942,105 @@ index 0000000..ebf6ae4 + + inflateEnd(&zstream); +} -diff --git a/ras-erst.h b/ras-erst.h -new file mode 100644 -index 0000000..96ab58a ---- /dev/null -+++ b/ras-erst.h -@@ -0,0 +1,11 @@ -+#define ERST_DELETE "ERST_DELETE" -+ -+extern int erst_mce_enable; -+extern int erst_panic_enable; -+extern int erst_delete; +diff --git a/ras-erst.c b/ras-erst.c +index c024d60..a0ece1b 100644 +--- a/ras-erst.c ++++ b/ras-erst.c +@@ -14,6 +14,8 @@ + #include "ras-logger.h" + #include "ras-mce-handler.h" + #include "ras-record.h" ++#include "ras-report.h" ++#include "ras-time.h" + #include "types.h" + + struct mce { +@@ -43,11 +45,7 @@ struct mce { + uint32_t microcode; /* Microcode revision */ + }; + +-static int erst_delete; +- +-#define ERST_PATH "/sys/fs/pstore/erst" +-#define MCE_ERST_PREFIX "mce-erst" +-#define ERST_EVENT_NAME "mce_erst_record" ++int erst_delete; + + #ifdef HAVE_MCE + static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e) +@@ -80,6 +78,9 @@ static void ras_erst_mce_handler(struct ras_events *ras, struct mce_event *e) + "<...>", 0, -1, "....", 0.0f, ERST_EVENT_NAME); + + 
report_mce_event(ras, NULL, &s, e); ++#ifdef HAVE_JSON_REPORT ++ report_mce_event_json(&s, e); ++#endif + trace_seq_terminate(&s); + trace_seq_do_printf(&s); + printf("\n"); +@@ -188,8 +189,15 @@ static void handle_erst_mce(void) + /* ERST just support mce now */ + void handle_erst(void) + { ++ get_boot_time(&boot_time); ++ suspended_time = get_suspended_time(); + + if (getenv(ERST_DELETE)) + erst_delete = atoi(getenv(ERST_DELETE)); + +#ifdef HAVE_MCE -+void handle_erst_mce(void); + handle_erst_mce(); +#endif + -+void handle_erst_panic(void); -diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index ecc6468..686c308 100644 ---- a/ras-mce-handler.c -+++ b/ras-mce-handler.c -@@ -275,7 +275,7 @@ int register_mce_handler(struct ras_events *ras, unsigned int ncpus) - * End of mcelog's code - */ - --static void report_mce_event(struct ras_events *ras, -+void report_mce_event(struct ras_events *ras, - struct pevent_record *record, - struct trace_seq *s, struct mce_event *e) - { -diff --git a/ras-mce-handler.h b/ras-mce-handler.h -index f0dbdab..df24be9 100644 ---- a/ras-mce-handler.h -+++ b/ras-mce-handler.h -@@ -77,6 +77,7 @@ struct mce_event { - uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ ++ handle_erst_panic(); + } +diff --git a/ras-erst.h b/ras-erst.h +index 83d7535..29a5587 100644 +--- a/ras-erst.h ++++ b/ras-erst.h +@@ -8,10 +8,17 @@ + #define __RAS_ERST_H - int severity; -+ int erst; - /* Parsed data */ - char timestamp[64]; - char bank_name[64]; -@@ -178,4 +179,8 @@ int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e); + #define ERST_DELETE "ERST_DELETE" ++#define ERST_PATH "/sys/fs/pstore/erst" ++#define MCE_ERST_PREFIX "mce-erst" ++#define ERST_EVENT_NAME "mce_erst_record" ++#define ERST_PANIC_NAME "dmesg_erst_record" ++#define LAST_REBOOT_INDEX 2 - int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e); ++extern int erst_delete; + #ifdef HAVE_MCE + void handle_erst_mce(void); + #endif -+void 
report_mce_event(struct ras_events *ras, -+ struct pevent_record *record, -+ struct trace_seq *s, struct mce_event *e); -+ + void handle_erst(void); ++void handle_erst_panic(void); #endif diff --git a/ras-record.h b/ras-record.h -index f48fe37..17cc981 100644 +index 7f49b74..416f679 100644 --- a/ras-record.h +++ b/ras-record.h -@@ -45,6 +45,7 @@ struct ras_mc_event { - signed char top_layer, middle_layer, lower_layer; - unsigned long long address, grain, syndrome; - const char *driver_detail; -+ int erst; - }; - - struct ras_mc_offline_event { -@@ -64,6 +65,9 @@ struct ras_aer_event { - uint8_t tlp_header_valid; - uint32_t *tlp_header; - const char *msg; -+ int erst; -+ uint16_t vendor_id; -+ uint16_t device_id; - }; - - struct ras_extlog_event { -@@ -101,6 +105,7 @@ struct ras_arm_event { - uint32_t ctx_len; - const uint8_t *vsei_error; - uint32_t oem_len; +@@ -101,6 +101,7 @@ struct ras_arm_event { + uint64_t error_info; + uint64_t virt_fault_addr; + uint64_t phy_fault_addr; + int erst; }; struct devlink_event { diff --git a/ras-report-json.c b/ras-report-json.c -index b175723..1c3b571 100644 +index 2d35355..e28cfac 100644 --- a/ras-report-json.c +++ b/ras-report-json.c -@@ -37,7 +37,7 @@ void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev) - return; - - trace_seq_printf(s, -- "\n{ \"%s\": \"mc_event\", " \ -+ "\n{ \"%s\": \"%s\", " \ - "\"timestamp\": \"%s\", " \ - "\"severity\": \"%s\", " \ - "\"error_count\": %d, " \ -@@ -50,6 +50,7 @@ void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev) - "\"syndrome\": \"%#llx\", " \ +@@ -45,6 +45,7 @@ void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev) + "\"syndrome\": \"%#llx\", " "\"driver_detail\": \"%s\" }", JSON_REPORT_KEY, + ev->erst ? "erst_mc_event" : "mc_event", (*ev->timestamp) ? 
ev->timestamp : NONE, severity_strs[ev->severity], ev->error_count, -@@ -121,7 +122,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev) +@@ -114,7 +115,7 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev) get_pci_dev_name(ev->dev_name, pci_name, 128, &vendor, &device); trace_seq_printf(s, @@ -1288,7 +1049,7 @@ index b175723..1c3b571 100644 "\"timestamp\": \"%s\", " \ "\"severity\": \"%s\", " \ "\"error_type\": \"%s\", " \ -@@ -131,12 +132,14 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev) +@@ -124,12 +125,14 @@ void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev) "\"device_id\": \"%#x\", " \ "\"msg\": \"%s\" }", JSON_REPORT_KEY, @@ -1300,11 +1061,11 @@ index b175723..1c3b571 100644 (*pci_name) ? pci_name : NONE, - vendor, device, + ev->vendor_id ? ev->vendor_id : vendor, -+ ev->device_id ? ev->device_id: device, ++ ev->device_id ? ev->device_id : device, (ev->msg) ? ev->msg : NONE); } -@@ -146,7 +149,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev) +@@ -139,7 +142,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev) return; trace_seq_printf(s, @@ -1313,7 +1074,7 @@ index b175723..1c3b571 100644 "\"timestamp\": \"%s\", " \ "\"error_count\": %d, " \ "\"affinity\": %d, " \ -@@ -155,6 +158,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev) +@@ -148,6 +151,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev) "\"running_state\": %d, " \ "\"psci_state\": %d }", JSON_REPORT_KEY, @@ -1321,73 +1082,83 @@ index b175723..1c3b571 100644 (*ev->timestamp) ? ev->timestamp : NONE, ev->error_count, ev->affinity, -@@ -193,7 +197,7 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) - ev->severity = GHES_SEV_CORRECTED; +@@ -173,6 +177,24 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev) + (ev->action_result) ? 
ev->action_result : NONE); + } - trace_seq_printf(s, -- "\n{ \"%s\": \"mce_record\", " \ -+ "\n{ \"%s\": \"%s\", " \ - "\"timestamp\": \"%s\", " \ - "\"severity\": \"%s\", " \ - "\"bank\": %d, " \ -@@ -218,6 +222,7 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) - "\"mcgcap\": \"%#lx\", " \ ++void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev) ++{ ++ if (!s || !ev || !json_report) ++ return; ++ ++ trace_seq_printf(s, ++ "\n{ \"%s\": \"signal_event\", \"timestamp\": \"%s\", " \ ++ "\"signo\": %d, \"sigerr\": %d, " \ ++ "\"sigcode\": %d, \"comm\": \"%s\", " \ ++ "\"pid\": %d, \"group\": %d, " \ ++ "\"result\": %d }", ++ JSON_REPORT_KEY, ++ (*ev->timestamp) ? ev->timestamp : NONE, ++ ev->sig, ev->error_no, ev->code, ++ (ev->comm) ? ev->comm : NONE, ++ ev->pid, ev->group, ev->result); ++} ++ + void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) + { + if (!s || !ev || !json_report) +@@ -211,6 +233,7 @@ void report_mce_event_json(struct trace_seq *s, struct mce_event *ev) + "\"mcgcap\": \"%#lx\", " "\"apicid\": \"%#x\" }", JSON_REPORT_KEY, + ev->erst ? "erst_mce_record" : "mce_record", (*ev->timestamp) ? 
ev->timestamp : NONE, severity_strs[ev->severity], ev->bank, -diff --git a/rasdaemon.c b/rasdaemon.c -index 02e219a..987c544 100644 ---- a/rasdaemon.c -+++ b/rasdaemon.c -@@ -29,6 +29,7 @@ - #include "ras-record.h" - #include "ras-report.h" - #include "ras-kmsg.h" -+#include "ras-erst.h" +diff --git a/ras-report.h b/ras-report.h +index 0564992..7f7f304 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -129,6 +129,7 @@ void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev); + void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev); + void report_mce_event_json(struct trace_seq *s, struct mce_event *ev); + void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg); ++void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev); + #endif - /* - * Arguments(argp) handling logic and main -@@ -150,6 +151,21 @@ int main(int argc, char *argv[]) - kmsg_monitor = 0; #endif +diff --git a/ras-signal-handler.c b/ras-signal-handler.c +index d15c4f6..0d999a6 100644 +--- a/ras-signal-handler.c ++++ b/ras-signal-handler.c +@@ -130,6 +130,9 @@ int ras_signal_event_handler(struct trace_seq *s, struct tep_record *record, -+#ifdef HAVE_ERST -+ if (choices_disable != NULL && -+ strlen(choices_disable) != 0 && -+ strstr(choices_disable, "erst_mce")) -+ erst_mce_enable = 0; -+ -+ if (choices_disable != NULL && -+ strlen(choices_disable) != 0 && -+ strstr(choices_disable, "erst_panic")) -+ erst_panic_enable = 0; -+ -+ if (getenv(ERST_DELETE)) -+ erst_delete = atoi(getenv(ERST_DELETE)); -+#endif -+ - #ifdef HAVE_MCE - const struct argp_option offline_options[] = { - {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"}, -@@ -233,6 +249,15 @@ int main(int argc, char *argv[]) - get_boot_time(&boot_time); - suspended_time = get_suspended_time(); + report_ras_signal_event(s, &ev); -+#ifdef HAVE_ERST -+ if (erst_panic_enable) -+ handle_erst_panic(); -+#ifdef HAVE_MCE -+ if (erst_mce_enable) -+ handle_erst_mce(); -+#endif 
++#ifdef HAVE_JSON_REPORT ++ report_signal_event_json(s, &ev); +#endif -+ - handle_ras_events(args.record_events); + /* Store data into the SQLite DB */ + #ifdef HAVE_SQLITE3 + ras_store_signal_event(ras, &ev); +diff --git a/rasdaemon.c b/rasdaemon.c +index 30dcaf4..335c047 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -247,13 +247,11 @@ int main(int argc, char *argv[]) + exit(EXIT_FAILURE); - return 0; + #ifdef HAVE_ERST +-#ifdef HAVE_MCE + if (choices_disable && strlen(choices_disable) != 0 && + strstr(choices_disable, "ras:erst")) + log(ALL, LOG_INFO, "Disabled ras:erst from config\n"); + else + handle_erst(); +-#endif + #endif + if (getenv(PCIE_EDPC_ENABLE) && atoi(getenv(PCIE_EDPC_ENABLE))) + config_pcie_edpc(); -- -2.33.1 +2.43.5 diff --git a/1020-Check-CPUs-online-not-configured.patch b/1020-Check-CPUs-online-not-configured.patch deleted file mode 100644 index d5cfd5bbc2f7815790d884a3a04118dd9cd420f0..0000000000000000000000000000000000000000 --- a/1020-Check-CPUs-online-not-configured.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 3b6f8473b12885db7e1ac2e467ccbdac913c629c Mon Sep 17 00:00:00 2001 -From: Zeph / Liz Loss-Cutler-Hull -Date: Sun, 9 Jul 2023 04:57:19 -0700 -Subject: [PATCH 20/85] Check CPUs online, not configured. - -When the number of CPUs detected is greater than the number of CPUs in -the system, rasdaemon will crash when it receives some events. - -Looking deeper, we also fail to use the poll method for similar reasons -in this case. - -All of this can be prevented by checking to see how many CPUs are -currently online (sysconf(_SC_NPROCESSORS_ONLN)) instead of how many -CPUs the current kernel was configured to support -(sysconf(_SC_NPROCESSORS_CONF)). 
- -For the kernel side of the discussion, see https://lore.kernel.org/lkml/CAM6Wdxft33zLeeXHhmNX5jyJtfGTLiwkQSApc=10fqf+rQh9DA@mail.gmail.com/T/ -Signed-off-by: Mauro Carvalho Chehab ---- - ras-events.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/ras-events.c b/ras-events.c -index f9922d4..9ad34f8 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -328,7 +328,7 @@ static void parse_ras_data(struct pthread_data *pdata, struct kbuffer *kbuf, - - static int get_num_cpus(struct ras_events *ras) - { -- return sysconf(_SC_NPROCESSORS_CONF); -+ return sysconf(_SC_NPROCESSORS_ONLN); - #if 0 - char fname[MAX_PATH + 1]; - int num_cpus = 0; --- -2.33.1 - diff --git a/1082-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch b/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch similarity index 86% rename from 1082-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch rename to 1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch index 29d4567a720288f3f0386ff8ff8c143962656f51..26de833c4c534b61ca8a37449dd805720f22228d 100644 --- a/1082-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch +++ b/1020-anolis-rasdaemon-add-amdgpu-ras-error-monitor.patch @@ -1,7 +1,7 @@ -From 65e6c233804512a40f4626d86a0f3de0041f403b Mon Sep 17 00:00:00 2001 +From e58b2e2c034ecfd6de044d8daee6d66a18b1ea3c Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Tue, 17 Dec 2024 09:36:55 +0800 -Subject: [PATCH 82/85] anolis: rasdaemon: add amdgpu ras error monitor +Subject: [PATCH 20/30] anolis: rasdaemon: add amdgpu ras error monitor Signed-off-by: Ruidong Tian --- @@ -13,44 +13,43 @@ Signed-off-by: Ruidong Tian ras-kmsg.h | 25 ++++++ ras-mce-handler.c | 3 + ras-record.h | 3 + - ras-report-json.c | 80 +++++++++++++++++ + ras-report-json.c | 81 +++++++++++++++++ ras-report.h | 3 + - rasdaemon.c | 2 + - 11 files changed, 342 insertions(+), 1 deletion(-) + 10 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 ras-kmsg-amdgpu.c diff --git a/Makefile.am 
b/Makefile.am -index 3efcd9e..3d0a315 100644 +index da6ef46..328fa49 100644 --- a/Makefile.am +++ b/Makefile.am -@@ -73,7 +73,7 @@ if WITH_YITIAN_NS_DECODE - rasdaemon_SOURCES += non-standard-yitian.c +@@ -135,7 +135,7 @@ if WITH_SIGNAL + rasdaemon_SOURCES += ras-signal-handler.c endif if WITH_KMSG_MONITOR -- rasdaemon_SOURCES += ras-kmsg.c -+ rasdaemon_SOURCES += ras-kmsg.c ras-kmsg-amdgpu.c +- rasdaemon_SOURCES += ras-kmsg.c ras-time.c ++ rasdaemon_SOURCES += ras-kmsg.c ras-time.c ras-kmsg-amdgpu.c endif - if WITH_ERST - rasdaemon_SOURCES += ras-erst.c + + if WITH_POISON_PAGE_STAT diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 1287183..c001afb 100644 +index f498e24..2816505 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -103,6 +103,7 @@ SIGNAL_TRIGGER= - SIGNAL_TRIGGER_TIMEOUT=0 +@@ -131,6 +131,7 @@ MC_CE_STAT_THRESHOLD=2000 + POISON_STAT_THRESHOLD=102400 ERST_DELETE=1 -+AMDGPU_MCA_ENABLED=1 ++AMDGPU_MCA_ENABLED=0 - # KMSG MONITOR - KMSG_IGNORE_XID="" + # EDPC config + # diff --git a/ras-events.c b/ras-events.c -index f61b155..845e879 100644 +index d40f29e..88c83df 100644 --- a/ras-events.c +++ b/ras-events.c -@@ -609,6 +609,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - goto error; +@@ -624,6 +624,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, } else if (size > 0) { + kmsg_buf[size] = '\0'; kmsg_match(kmsg_buf); + amdgpu_tracer_match(kmsg_buf); memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX); @@ -58,7 +57,7 @@ index f61b155..845e879 100644 count_nready++; diff --git a/ras-kmsg-amdgpu.c b/ras-kmsg-amdgpu.c new file mode 100644 -index 0000000..0d9900c +index 0000000..c46525a --- /dev/null +++ b/ras-kmsg-amdgpu.c @@ -0,0 +1,219 @@ @@ -71,7 +70,7 @@ index 0000000..0d9900c +#include +#include +#include -+#include ++#include +#include "ras-logger.h" +#include "ras-report.h" + @@ -283,10 +282,10 @@ index 0000000..0d9900c +} \ No newline at end of file diff --git a/ras-kmsg.c b/ras-kmsg.c -index 
0230180..c288f26 100644 +index 2dd47d6..deeb475 100644 --- a/ras-kmsg.c +++ b/ras-kmsg.c -@@ -77,6 +77,8 @@ int kmsg_tracer_destroy(void) +@@ -72,6 +72,8 @@ int kmsg_tracer_destroy(void) } free(kmsg_tracer); @@ -295,7 +294,7 @@ index 0230180..c288f26 100644 return 0; } -@@ -87,6 +89,8 @@ int kmsg_tracer_init(void) +@@ -82,6 +84,8 @@ int kmsg_tracer_init(void) char buf[1026], *kmsg_tracer_name, *kmsg_tracer_regex, *tmp; char *kmsg_tracer_group_key, *token; @@ -350,10 +349,10 @@ index f31125f..9e34da5 100644 + #endif diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index 686c308..e53854d 100644 +index b61976a..fc2e8d4 100644 --- a/ras-mce-handler.c +++ b/ras-mce-handler.c -@@ -475,6 +475,9 @@ int ras_offline_mce_event(struct ras_mc_offline_event *event) +@@ -491,6 +491,9 @@ int ras_offline_mce_event(struct ras_mc_offline_event *event) trace_seq_init(&s); report_mce_offline(&s, mce, priv); @@ -364,10 +363,10 @@ index 686c308..e53854d 100644 fflush(stdout); trace_seq_destroy(&s); diff --git a/ras-record.h b/ras-record.h -index 91f9d1c..42fecb8 100644 +index 416f679..d0230f7 100644 --- a/ras-record.h +++ b/ras-record.h -@@ -56,6 +56,9 @@ struct ras_mc_offline_event { +@@ -46,6 +46,9 @@ struct ras_mc_offline_event { uint64_t ipid; uint64_t synd; uint64_t status; @@ -378,13 +377,13 @@ index 91f9d1c..42fecb8 100644 struct ras_aer_event { diff --git a/ras-report-json.c b/ras-report-json.c -index 8dbcd90..6508b60 100644 +index e28cfac..577e856 100644 --- a/ras-report-json.c +++ b/ras-report-json.c -@@ -324,3 +324,83 @@ out: - fflush(stdout); +@@ -319,3 +319,84 @@ out: trace_seq_destroy(&seq); } + #endif + +void report_mce_offline_json(struct trace_seq *s, struct mce_event *mce, + struct ras_mc_offline_event *e) @@ -407,6 +406,7 @@ index 8dbcd90..6508b60 100644 + "\"mcistatus_msg\": \"%s\", " \ + "\"mc_location\": \"%s\", " \ + "\"error_msg\": \"%s\", " \ ++ "\"pci_bdf\": \"%s\", " \ + "\"pci_dev_name\": \"%s\", " \ + "\"vendor_id\": \"%#x\", " \ + "\"device_id\": \"%#x\", 
" \ @@ -423,7 +423,7 @@ index 8dbcd90..6508b60 100644 + (*mce->mcistatus_msg) ? mce->mcistatus_msg : NONE, + (*mce->mc_location) ? mce->mc_location : NONE, + (*mce->error_msg) ? mce->error_msg : NONE, -+ pci_name, vendor, device, ++ tmpbuf, pci_name, vendor, device, + e->status, e->addr, e->misc0, e->ipid, e->synd); +} + @@ -465,12 +465,11 @@ index 8dbcd90..6508b60 100644 + fflush(stdout); + trace_seq_destroy(&seq); +} -\ No newline at end of file diff --git a/ras-report.h b/ras-report.h -index 98c4542..b2cd97d 100644 +index 7f7f304..7066a74 100644 --- a/ras-report.h +++ b/ras-report.h -@@ -69,6 +69,9 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev); +@@ -130,6 +130,9 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev); void report_mce_event_json(struct trace_seq *s, struct mce_event *ev); void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg); void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev); @@ -480,19 +479,6 @@ index 98c4542..b2cd97d 100644 #endif #endif -diff --git a/rasdaemon.c b/rasdaemon.c -index 987c544..4f7246f 100644 ---- a/rasdaemon.c -+++ b/rasdaemon.c -@@ -257,6 +257,8 @@ int main(int argc, char *argv[]) - handle_erst_mce(); - #endif - #endif -+ get_boot_time(&boot_time); -+ suspended_time = get_suspended_time(); - - handle_ras_events(args.record_events); - -- -2.33.1 +2.43.5 diff --git a/1085-rasdaemon-disable-ce-offline-default.patch b/1021-anolis-config-disable-page-offline-defalut.patch similarity index 71% rename from 1085-rasdaemon-disable-ce-offline-default.patch rename to 1021-anolis-config-disable-page-offline-defalut.patch index 8ddf0756093e276c4944d2038fe5c6ba84d4277f..880db2207a175d13ecab251b40fa4458d5e085e4 100644 --- a/1085-rasdaemon-disable-ce-offline-default.patch +++ b/1021-anolis-config-disable-page-offline-defalut.patch @@ -1,7 +1,7 @@ -From d246f8ccfdb38fbfcfd4b7405ec4f73237e7521c Mon Sep 17 00:00:00 2001 +From 
344b4080d5d093123de8973b74f8289201931483 Mon Sep 17 00:00:00 2001 From: Ruidong Tian -Date: Tue, 24 Dec 2024 17:14:01 +0800 -Subject: [PATCH] anolis: rasdaemon: disable page offline default +Date: Mon, 10 Mar 2025 11:27:45 +0800 +Subject: [PATCH 21/30] anolis: config: disable page offline defalut Signed-off-by: Ruidong Tian --- @@ -9,10 +9,10 @@ Signed-off-by: Ruidong Tian 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index b5b163e..a505b52 100644 +index 2816505..1833f1b 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -26,7 +26,7 @@ PAGE_CE_THRESHOLD="50" +@@ -54,7 +54,7 @@ ROW_CE_ACTION="off" # Requires an uptodate kernel. Might not be successfull. # soft-then-hard First try to soft offline, then try hard offlining. # Note: default offline choice is "soft". @@ -22,5 +22,5 @@ index b5b163e..a505b52 100644 # CPU Online Fault Isolation # Whether to enable cpu online fault isolation (yes|no). -- -2.33.1 +2.43.5 diff --git a/1021-rasdaemon-log-non_standard_event-at-just-one-line.patch b/1021-rasdaemon-log-non_standard_event-at-just-one-line.patch deleted file mode 100644 index c841092878d34e1b29a08d56d00029d9b1387639..0000000000000000000000000000000000000000 --- a/1021-rasdaemon-log-non_standard_event-at-just-one-line.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 75223164b507d89c8f90d7ca8e1034ef36d550fb Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 7 Sep 2023 18:19:40 +0800 -Subject: [PATCH 21/85] rasdaemon: log non_standard_event at just one line - -It is more reasonable log non_standard_event in one line exclude errors -dump. So you can easily to get decoded non_standard_event log in one -line if you implement a decoder like other event. 
- -Signed-off-by: Ruidong Tian -Signed-off-by: Mauro Carvalho Chehab ---- - ras-non-standard-handler.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 6932e58..8672b16 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -160,7 +160,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, - case GHES_SEV_PANIC: - ev.severity = "Fatal"; - } -- trace_seq_printf(s, "\n %s", ev.severity); -+ trace_seq_printf(s, " %s", ev.severity); - - ev.sec_type = pevent_get_field_raw(s, event, "sec_type", - record, &len, 1); -@@ -171,7 +171,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, - trace_seq_printf(s, "\n section type: %s", - "Ampere Specific Error\n"); - else -- trace_seq_printf(s, "\n section type: %s", -+ trace_seq_printf(s, " section type: %s", - uuid_le(ev.sec_type)); - ev.fru_text = pevent_get_field_raw(s, event, "fru_text", - record, &len, 1); -@@ -184,7 +184,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, - if (pevent_get_field_val(s, event, "len", record, &val, 1) < 0) - return -1; - ev.length = val; -- trace_seq_printf(s, "\n length: %d\n", ev.length); -+ trace_seq_printf(s, " length: %d", ev.length); - - ev.error = pevent_get_field_raw(s, event, "buf", record, &len, 1); - if(!ev.error) --- -2.33.1 - diff --git a/1091-anolis-disable-block-and-dev-error-default.patch b/1022-anolis-disable-block-and-dev-error-default.patch similarity index 49% rename from 1091-anolis-disable-block-and-dev-error-default.patch rename to 1022-anolis-disable-block-and-dev-error-default.patch index f33b1bfe5aa7c576e046a177428320aae6fffc79..83ba86cd54d1979765171b83de7e5a2664843225 100644 --- a/1091-anolis-disable-block-and-dev-error-default.patch +++ b/1022-anolis-disable-block-and-dev-error-default.patch @@ -1,7 +1,7 @@ -From e0e0866270b0db663aff2feecd255a082fe32c0c Mon Sep 17 00:00:00 2001 +From b5d1f625e8cee3697965e975483e523543d38b4b Mon 
Sep 17 00:00:00 2001 From: Ruidong Tian Date: Wed, 12 Mar 2025 09:59:55 +0800 -Subject: [PATCH] anolis: disable block and dev error default +Subject: [PATCH 22/30] anolis: disable block and dev error default Signed-off-by: Ruidong Tian --- @@ -9,18 +9,18 @@ Signed-off-by: Ruidong Tian 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index e7c115f..c674136 100644 +index 1833f1b..198b050 100644 --- a/misc/rasdaemon.env +++ b/misc/rasdaemon.env -@@ -46,7 +46,7 @@ CPU_ISOLATION_CYCLE="24h" +@@ -73,7 +73,7 @@ CPU_ISOLATION_CYCLE="24h" + # Prevent excessive isolation from causing an avalanche effect CPU_ISOLATION_LIMIT="10" +-DISABLE="json_report,kmsg_monitor" ++DISABLE="json_report,kmsg_monitor,block:block_rq_complete,devlink:devlink_health_report" --DISABLE="" -+DISABLE="block:block_rq_complete,devlink:devlink_health_report" # Event Trigger - # Event trigger will be executed when the specified event occurs. -- -2.33.1 +2.43.5 diff --git a/1022-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch b/1022-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch deleted file mode 100644 index 153f22959955d7e909b9c8305be025953a2601fd..0000000000000000000000000000000000000000 --- a/1022-rasdaemon-add-support-for-THead-Yitian-non-standard-.patch +++ /dev/null @@ -1,411 +0,0 @@ -From a017a508b00e6d817539e9d9e53533b45e7d1da4 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 7 Sep 2023 18:21:05 +0800 -Subject: [PATCH 22/85] rasdaemon: add support for THead Yitian non-standard - error decoder - -Add a new non-standard error decoder to decode THead YiTian error -section. Put all related code to a new source file. 
- -Signed-off-by: Ruidong Tian -Signed-off-by: Mauro Carvalho Chehab -[Ruidong: fix conlict with cxl] ---- - Makefile.am | 6 +- - configure.ac | 11 ++ - non-standard-yitian.c | 251 ++++++++++++++++++++++++++++++++++++++++++ - non-standard-yitian.h | 73 ++++++++++++ - 4 files changed, 340 insertions(+), 1 deletion(-) - create mode 100644 non-standard-yitian.c - create mode 100644 non-standard-yitian.h - -diff --git a/Makefile.am b/Makefile.am -index 242ceb7..b16cf34 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -66,6 +66,10 @@ endif - if WITH_CPU_FAULT_ISOLATION - rasdaemon_SOURCES += ras-cpu-isolation.c queue.c - endif -+if WITH_YITIAN_NS_DECODE -+ rasdaemon_SOURCES += non-standard-yitian.c -+endif -+ - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ -@@ -73,7 +77,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ - non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -- ras-cpu-isolation.h queue.h -+ ras-cpu-isolation.h queue.h non-standard-yitian.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -diff --git a/configure.ac b/configure.ac -index d098fcf..135af9c 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -171,6 +171,16 @@ AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "x - AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes]) - AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"]) - -+AC_ARG_ENABLE([yitian_ns_decode], -+ AS_HELP_STRING([--enable-yitian-ns-decode], [enable YITIAN_NS_DECODE events (currently experimental)])) -+ -+AS_IF([test 
"x$enable_yitian_ns_decode" = "xyes" || test "x$enable_all" == "xyes"], [ -+ AC_DEFINE(HAVE_YITIAN_NS_DECODE,1,"have YITIAN UNKNOWN_SEC events decode") -+ AC_SUBST([WITH_YITIAN_NS_DECODE]) -+]) -+AM_CONDITIONAL([WITH_YITIAN_NS_DECODE], [test x$enable_yitian_ns_decode = xyes || test x$enable_all == xyes]) -+AM_COND_IF([WITH_YITIAN_NS_DECODE], [USE_YITIAN_NS_DECODE="yes"], [USE_YITIAN_NS_DECODE="no"]) -+ - test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc - - CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" -@@ -212,4 +222,5 @@ compile time options summary - Memory CE PFA : $USE_MEMORY_CE_PFA - AMP RAS errors : $USE_AMP_NS_DECODE - CPU fault isolation : $USE_CPU_FAULT_ISOLATION -+ YITIAN RAS errors : $USE_YITIAN_NS_DECODE - EOF -diff --git a/non-standard-yitian.c b/non-standard-yitian.c -new file mode 100644 -index 0000000..99cea47 ---- /dev/null -+++ b/non-standard-yitian.c -@@ -0,0 +1,251 @@ -+/* -+ * Copyright (C) 2023 Alibaba Inc -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. 
-+ * -+ */ -+ -+#include -+#include -+#include -+#include -+#include "ras-record.h" -+#include "ras-logger.h" -+#include "ras-report.h" -+#include "ras-non-standard-handler.h" -+#include "non-standard-yitian.h" -+ -+static const char * const yitian_ddr_payload_err_reg_name[] = { -+ "Error Type:", -+ "Error SubType:", -+ "Error Instance:", -+ "ECCCFG0:", -+ "ECCCFG1:", -+ "ECCSTAT:", -+ "ECCERRCNT:", -+ "ECCCADDR0:", -+ "ECCCADDR1:", -+ "ECCCSYN0:", -+ "ECCCSYN1:", -+ "ECCCSYN2:", -+ "ECCUADDR0:", -+ "ECCUADDR1:", -+ "ECCUSYN0:", -+ "ECCUSYN1:", -+ "ECCUSYN2:", -+ "ECCBITMASK0:", -+ "ECCBITMASK1:", -+ "ECCBITMASK2:", -+ "ADVECCSTAT:", -+ "ECCAPSTAT:", -+ "ECCCDATA0:", -+ "ECCCDATA1:", -+ "ECCUDATA0:", -+ "ECCUDATA1:", -+ "ECCSYMBOL:", -+ "ECCERRCNTCTL:", -+ "ECCERRCNTSTAT:", -+ "ECCERRCNT0:", -+ "ECCERRCNT1:", -+ "RESERVED0:", -+ "RESERVED1:", -+ "RESERVED2:", -+}; -+ -+struct yitian_ras_type_info { -+ int id; -+ const char *name; -+ const char * const *sub; -+ int sub_num; -+}; -+ -+static const struct yitian_ras_type_info yitian_payload_error_type[] = { -+ { -+ .id = YITIAN_RAS_TYPE_DDR, -+ .name = "DDR", -+ }, -+ { -+ } -+}; -+ -+#ifdef HAVE_SQLITE3 -+static const struct db_fields yitian_ddr_payload_fields[] = { -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "address", .type = "INTEGER" }, -+ { .name = "regs_dump", .type = "TEXT" }, -+}; -+ -+static const struct db_table_descriptor yitian_ddr_payload_section_tab = { -+ .name = "yitian_ddr_reg_dump_event", -+ .fields = yitian_ddr_payload_fields, -+ .num_fields = ARRAY_SIZE(yitian_ddr_payload_fields), -+}; -+ -+int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, -+ struct ras_yitian_ddr_payload_event *ev) -+{ -+ int rc; -+ struct sqlite3_stmt *stmt = ev_decoder->stmt_dec_record; -+ -+ log(TERM, LOG_INFO, "yitian_ddr_reg_dump_event store: %p\n", stmt); -+ -+ sqlite3_bind_text (stmt, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_int64 
(stmt, 2, ev->address); -+ sqlite3_bind_text (stmt, 3, ev->reg_msg, -1, NULL); -+ -+ rc = sqlite3_step(stmt); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to do yitian_ddr_reg_dump_event step on sqlite: error = %d\n", rc); -+ rc = sqlite3_reset(stmt); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed reset yitian_ddr_reg_dump_event on sqlite: error = %d\n", rc); -+ log(TERM, LOG_INFO, "register inserted at db\n"); -+ -+ return rc; -+} -+#endif -+ -+static const char *oem_type_name(const struct yitian_ras_type_info *info, -+ uint8_t type_id) -+{ -+ const struct yitian_ras_type_info *type = &info[0]; -+ -+ for (; type->name; type++) { -+ if (type->id != type_id) -+ continue; -+ return type->name; -+ } -+ return "unknown"; -+} -+ -+static const char *oem_subtype_name(const struct yitian_ras_type_info *info, -+ uint8_t type_id, uint8_t sub_type_id) -+{ -+ const struct yitian_ras_type_info *type = &info[0]; -+ -+ for (; type->name; type++) { -+ const char * const *submodule = type->sub; -+ -+ if (type->id != type_id) -+ continue; -+ if (type->sub == NULL) -+ return type->name; -+ if (sub_type_id >= type->sub_num) -+ return "unknown"; -+ return submodule[sub_type_id]; -+ } -+ return "unknown"; -+} -+ -+void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, -+ struct trace_seq *s, -+ const struct yitian_ddr_payload_type_sec *err, -+ struct ras_events *ras) -+{ -+ char buf[1024]; -+ char *p = buf; -+ char *end = buf + 1024; -+ int i = 0; -+ const struct yitian_payload_header *header = &err->header; -+ uint32_t *pstart; -+ time_t now; -+ struct tm *tm; -+ struct ras_yitian_ddr_payload_event ev; -+ -+ const char *type_str = oem_type_name(yitian_payload_error_type, -+ header->type); -+ -+ const char *subtype_str = oem_subtype_name(yitian_payload_error_type, -+ header->type, header->subtype); -+ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { -+ if 
(ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -+ &yitian_ddr_payload_section_tab) != SQLITE_OK) { -+ trace_seq_printf(s, "create sql fail\n"); -+ return; -+ } -+ } -+#endif -+ -+ now = time(NULL); -+ tm = localtime(&now); -+ if (tm) -+ strftime(ev.timestamp, sizeof(ev.timestamp), -+ "%Y-%m-%d %H:%M:%S %z", tm); -+ //display error type -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " %s,", type_str); -+ -+ //display error subtype -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " %s,", subtype_str); -+ -+ //display error instance -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " 0x%x,", header->instance); -+ -+ //display reg dump -+ for (pstart = (uint32_t *)&err->ecccfg0; (void *)pstart < (void *)(err + 1); pstart += 1) { -+ p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); -+ p += snprintf(p, end - p, " 0x%x ", *pstart); -+ } -+ -+ if (p > buf && p < end) { -+ p--; -+ *p = '\0'; -+ } -+ -+ ev.reg_msg = malloc(p - buf + 1); -+ memcpy(ev.reg_msg, buf, p - buf + 1); -+ ev.address = 0; -+ -+ i = 0; -+ p = NULL; -+ end = NULL; -+ trace_seq_printf(s, "%s\n", buf); -+ -+#ifdef HAVE_SQLITE3 -+ record_yitian_ddr_reg_dump_event(ev_decoder, &ev); -+#endif -+ -+} -+ -+/* error data decoding functions */ -+static int decode_yitian710_ns_error(struct ras_events *ras, -+ struct ras_ns_ev_decoder *ev_decoder, -+ struct trace_seq *s, -+ struct ras_non_standard_event *event) -+{ -+ int payload_type = event->error[0]; -+ -+ if (payload_type == YITIAN_RAS_TYPE_DDR) { -+ const struct yitian_ddr_payload_type_sec *err = -+ (struct yitian_ddr_payload_type_sec *)event->error; -+ decode_yitian_ddr_payload_err_regs(ev_decoder, s, err, ras); -+ } else { -+ trace_seq_printf(s, "%s: wrong payload type\n", __func__); -+ return -1; -+ } -+ return 0; -+} -+ -+struct ras_ns_ev_decoder 
yitian_ns_oem_decoder[] = { -+ { -+ .sec_type = "a6980811-16ea-4e4d-b936-fb00a23ff29c", -+ .decode = decode_yitian710_ns_error, -+ }, -+}; -+ -+static void __attribute__((constructor)) yitian_ns_init(void) -+{ -+ int i; -+ for (i = 0; i < ARRAY_SIZE(yitian_ns_oem_decoder); i++) -+ register_ns_ev_decoder(&yitian_ns_oem_decoder[i]); -+} -diff --git a/non-standard-yitian.h b/non-standard-yitian.h -new file mode 100644 -index 0000000..b7d6a2d ---- /dev/null -+++ b/non-standard-yitian.h -@@ -0,0 +1,73 @@ -+/* -+ * Copyright (C) 2023 Alibaba Inc -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+ -+ -+#ifndef __NON_STANDARD_YITIAN_H -+#define __NON_STANDARD_YITIAN_H -+ -+#include "ras-events.h" -+#include "libtrace/event-parse.h" -+ -+#define YITIAN_RAS_TYPE_DDR 0x50 -+ -+struct yitian_payload_header { -+ uint8_t type; -+ uint8_t subtype; -+ uint16_t instance; -+}; -+ -+struct yitian_ddr_payload_type_sec { -+ struct yitian_payload_header header; -+ uint32_t ecccfg0; -+ uint32_t ecccfg1; -+ uint32_t eccstat; -+ uint32_t eccerrcnt; -+ uint32_t ecccaddr0; -+ uint32_t ecccaddr1; -+ uint32_t ecccsyn0; -+ uint32_t ecccsyn1; -+ uint32_t ecccsyn2; -+ uint32_t eccuaddr0; -+ uint32_t eccuaddr1; -+ uint32_t eccusyn0; -+ uint32_t eccusyn1; -+ uint32_t eccusyn2; -+ uint32_t eccbitmask0; -+ uint32_t eccbitmask1; -+ uint32_t eccbitmask2; -+ uint32_t adveccstat; -+ uint32_t eccapstat; -+ uint32_t ecccdata0; -+ uint32_t ecccdata1; -+ uint32_t eccudata0; -+ uint32_t eccudata1; -+ uint32_t eccsymbol; -+ uint32_t eccerrcntctl; -+ uint32_t eccerrcntstat; -+ uint32_t eccerrcnt0; -+ uint32_t eccerrcnt1; -+ uint32_t reserved0; -+ uint32_t reserved1; -+ uint32_t reserved2; -+}; -+ -+struct ras_yitian_ddr_payload_event { -+ char timestamp[64]; -+ unsigned long long address; 
-+ char *reg_msg; -+}; -+ -+int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, -+ struct ras_yitian_ddr_payload_event *ev); -+void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, -+ struct trace_seq *s, -+ const struct yitian_ddr_payload_type_sec *err, -+ struct ras_events *ras); -+#endif --- -2.33.1 - diff --git a/1023-anolis-add-nvml-in-tree.patch b/1023-anolis-add-nvml-in-tree.patch new file mode 100644 index 0000000000000000000000000000000000000000..9d0718c05f30299dadfa9d37d93ee95acb325f77 --- /dev/null +++ b/1023-anolis-add-nvml-in-tree.patch @@ -0,0 +1,11441 @@ +From 46af414d74baab0e03d716e3af7e77ea3186c47e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 17 Apr 2025 17:17:55 +0800 +Subject: [PATCH 23/30] anolis: add nvml in tree + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 1 + + contrib/nvml.h | 11370 ++++++++++++++++++++++++++++++++++++++++++++++ + contrib/nvml.py | 13 +- + 3 files changed, 11381 insertions(+), 3 deletions(-) + create mode 100644 contrib/nvml.h + +diff --git a/Makefile.am b/Makefile.am +index 328fa49..4aba962 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -25,6 +25,7 @@ EXTRA_DIST = \ + misc/rasdaemon.env \ + misc/notices \ + contrib/nvml.py \ ++ contrib/nvml.h \ + contrib/*_trigger + + CLEANFILES= \ +diff --git a/contrib/nvml.h b/contrib/nvml.h +new file mode 100644 +index 0000000..937332e +--- /dev/null ++++ b/contrib/nvml.h +@@ -0,0 +1,11370 @@ ++/* ++ * Copyright 1993-2024 NVIDIA Corporation. All rights reserved. ++ * ++ * NOTICE TO USER: ++ * ++ * This source code is subject to NVIDIA ownership rights under U.S. and ++ * international Copyright laws. Users and possessors of this source code ++ * are hereby granted a nonexclusive, royalty-free license to use this code ++ * in individual and commercial software. ++ * ++ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE ++ * CODE FOR ANY PURPOSE. 
IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR ++ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH ++ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF ++ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. ++ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, ++ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS ++ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE ++ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE ++ * OR PERFORMANCE OF THIS SOURCE CODE. ++ * ++ * U.S. Government End Users. This source code is a "commercial item" as ++ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of ++ * "commercial computer software" and "commercial computer software ++ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) ++ * and is provided to the U.S. Government only as a commercial end item. ++ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through ++ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the ++ * source code with only those rights set forth herein. ++ * ++ * Any use of this source code in individual and commercial software must ++ * include, in the user documentation and internal comments to the code, ++ * the above Disclaimer and U.S. Government End Users Notice. ++ */ ++ ++/* ++NVML API Reference ++ ++The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and ++managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building ++3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi ++tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. 
++ ++API Documentation ++ ++Supported platforms: ++- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit ++- Linux: 32-bit and 64-bit ++- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 ++ ++Supported products: ++- Full Support ++ - All Tesla products, starting with the Fermi architecture ++ - All Quadro products, starting with the Fermi architecture ++ - All vGPU Software products, starting with the Kepler architecture ++ - Selected GeForce Titan products ++- Limited Support ++ - All Geforce products, starting with the Fermi architecture ++ ++The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is ++not be added to the system path by default. To dynamically link to NVML, add this path to the PATH ++environmental variable. To dynamically load NVML, call LoadLibrary with this path. ++ ++On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit ++and 64 bit NVML libraries will be installed. ++ ++Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html ++*/ ++ ++#ifndef __nvml_nvml_h__ ++#define __nvml_nvml_h__ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * On Windows, set up methods for DLL export ++ * define NVML_STATIC_IMPORT when using nvml_loader library ++ */ ++#if defined _WINDOWS ++ #if !defined NVML_STATIC_IMPORT ++ #if defined NVML_LIB_EXPORT ++ #define DECLDIR __declspec(dllexport) ++ #else ++ #define DECLDIR __declspec(dllimport) ++ #endif ++ #else ++ #define DECLDIR ++ #endif ++#else ++ #define DECLDIR ++#endif ++ ++ #define NVML_MCDM_SUPPORT ++ ++/** ++ * NVML API versioning support ++ */ ++#define NVML_API_VERSION 12 ++#define NVML_API_VERSION_STR "12" ++/** ++ * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. ++ * e.g. 
the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this ++ * guard if you need to support older versions of the API ++ */ ++#ifndef NVML_NO_UNVERSIONED_FUNC_DEFS ++ #define nvmlInit nvmlInit_v2 ++ #define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 ++ #define nvmlDeviceGetCount nvmlDeviceGetCount_v2 ++ #define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 ++ #define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 ++ #define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 ++ #define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 ++ #define nvmlDeviceGetGridLicensableFeatures nvmlDeviceGetGridLicensableFeatures_v4 ++ #define nvmlEventSetWait nvmlEventSetWait_v2 ++ #define nvmlDeviceGetAttributes nvmlDeviceGetAttributes_v2 ++ #define nvmlComputeInstanceGetInfo nvmlComputeInstanceGetInfo_v2 ++ #define nvmlDeviceGetComputeRunningProcesses nvmlDeviceGetComputeRunningProcesses_v3 ++ #define nvmlDeviceGetGraphicsRunningProcesses nvmlDeviceGetGraphicsRunningProcesses_v3 ++ #define nvmlDeviceGetMPSComputeRunningProcesses nvmlDeviceGetMPSComputeRunningProcesses_v3 ++ #define nvmlBlacklistDeviceInfo_t nvmlExcludedDeviceInfo_t ++ #define nvmlGetBlacklistDeviceCount nvmlGetExcludedDeviceCount ++ #define nvmlGetBlacklistDeviceInfoByIndex nvmlGetExcludedDeviceInfoByIndex ++ #define nvmlDeviceGetGpuInstancePossiblePlacements nvmlDeviceGetGpuInstancePossiblePlacements_v2 ++ #define nvmlVgpuInstanceGetLicenseInfo nvmlVgpuInstanceGetLicenseInfo_v2 ++ #define nvmlDeviceGetDriverModel nvmlDeviceGetDriverModel_v2 ++#endif // #ifndef NVML_NO_UNVERSIONED_FUNC_DEFS ++ ++#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \ ++ (ver << 24U)) ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceStructs Device Structs ++ * @{ ++ */ 
++/***************************************************************************************************/ ++ ++/** ++ * Special constant that some fields take when they are not available. ++ * Used when only part of the struct is not available. ++ * ++ * Each structure explicitly states when to check for this value. ++ */ ++#define NVML_VALUE_NOT_AVAILABLE (-1) ++ ++typedef struct nvmlDevice_st* nvmlDevice_t; ++ ++/** ++ * Buffer size guaranteed to be large enough for pci bus id ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 ++ ++/** ++ * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 ++ ++/** ++ * PCI information about a GPU device. ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff ++ unsigned int bus; //!< The bus on which the device resides, 0 to 0xff ++ unsigned int device; //!< The device's id on the bus, 0 to 31 ++ ++ unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id ++ unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID ++ ++ unsigned int baseClass; //!< The 8-bit PCI base class code ++ unsigned int subClass; //!< The 8-bit PCI sub class code ++ ++ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) ++} nvmlPciInfoExt_v1_t; ++typedef nvmlPciInfoExt_v1_t nvmlPciInfoExt_t; ++#define nvmlPciInfoExt_v1 NVML_STRUCT_VERSION(PciInfoExt, 1) ++ ++/** ++ * PCI information about a GPU device. 
++ */ ++typedef struct nvmlPciInfo_st ++{ ++ char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) ++ unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff ++ unsigned int bus; //!< The bus on which the device resides, 0 to 0xff ++ unsigned int device; //!< The device's id on the bus, 0 to 31 ++ unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id ++ ++ // Added in NVML 2.285 API ++ unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID ++ ++ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) ++} nvmlPciInfo_t; ++ ++/** ++ * PCI format string for ::busIdLegacy ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" ++ ++/** ++ * PCI format string for ::busId ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" ++ ++/** ++ * Utility macro for filling the pci bus id format from a nvmlPciInfo_t ++ */ ++#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo) (pciInfo)->domain, \ ++ (pciInfo)->bus, \ ++ (pciInfo)->device ++ ++/** ++ * Detailed ECC error counts for a device. ++ * ++ * @deprecated Different GPU families can have different memory error counters ++ * See \ref nvmlDeviceGetMemoryErrorCounter ++ */ ++typedef struct nvmlEccErrorCounts_st ++{ ++ unsigned long long l1Cache; //!< L1 cache errors ++ unsigned long long l2Cache; //!< L2 cache errors ++ unsigned long long deviceMemory; //!< Device memory errors ++ unsigned long long registerFile; //!< Register file errors ++} nvmlEccErrorCounts_t; ++ ++/** ++ * Utilization information for a device. ++ * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
++ */ ++typedef struct nvmlUtilization_st ++{ ++ unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU ++ unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written ++} nvmlUtilization_t; ++ ++/** ++ * Memory allocation information for a device (v1). ++ * The total amount is equal to the sum of the amounts of free and used memory. ++ */ ++typedef struct nvmlMemory_st ++{ ++ unsigned long long total; //!< Total physical device memory (in bytes) ++ unsigned long long free; //!< Unallocated device memory (in bytes) ++ unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes). ++ //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping ++} nvmlMemory_t; ++ ++/** ++ * Memory allocation information for a device (v2). ++ * ++ * Version 2 adds versioning for the struct and the amount of system-reserved memory as an output. ++ */ ++typedef struct nvmlMemory_v2_st ++{ ++ unsigned int version; //!< Structure format version (must be 2) ++ unsigned long long total; //!< Total physical device memory (in bytes) ++ unsigned long long reserved; //!< Device memory (in bytes) reserved for system use (driver or firmware) ++ unsigned long long free; //!< Unallocated device memory (in bytes) ++ unsigned long long used; //!< Allocated device memory (in bytes). ++} nvmlMemory_v2_t; ++ ++#define nvmlMemory_v2 NVML_STRUCT_VERSION(Memory, 2) ++ ++/** ++ * BAR1 Memory allocation Information for a device ++ */ ++typedef struct nvmlBAR1Memory_st ++{ ++ unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) ++ unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) ++ unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) ++}nvmlBAR1Memory_t; ++ ++/** ++ * Information about running compute processes on the GPU, legacy version ++ * for older versions of the API. 
++ */ ++typedef struct nvmlProcessInfo_v1_st ++{ ++ unsigned int pid; //!< Process ID ++ unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. ++ //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported ++ //! because Windows KMD manages all the memory and not the NVIDIA driver ++} nvmlProcessInfo_v1_t; ++ ++/** ++ * Information about running compute processes on the GPU ++ */ ++typedef struct nvmlProcessInfo_v2_st ++{ ++ unsigned int pid; //!< Process ID ++ unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. ++ //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported ++ //! because Windows KMD manages all the memory and not the NVIDIA driver ++ unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to ++ // 0xFFFFFFFF otherwise. ++ unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to ++ // 0xFFFFFFFF otherwise. ++} nvmlProcessInfo_v2_t, nvmlProcessInfo_t; ++ ++/** ++ * Information about running process on the GPU with protected memory ++ */ ++typedef struct ++{ ++ unsigned int pid; //!< Process ID ++ unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. ++ //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported ++ //! because Windows KMD manages all the memory and not the NVIDIA driver ++ unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is ++ // set to 0xFFFFFFFF otherwise. ++ unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId ++ // is set to 0xFFFFFFFF otherwise. ++ unsigned long long usedGpuCcProtectedMemory; //!< Amount of used GPU conf compute protected memory in bytes. 
++} nvmlProcessDetail_v1_t; ++ ++/** ++ * Information about all running processes on the GPU for the given mode ++ */ ++typedef struct ++{ ++ unsigned int version; //!< Struct version, MUST be nvmlProcessDetailList_v1 ++ unsigned int mode; //!< Process mode(Compute/Graphics/MPSCompute) ++ unsigned int numProcArrayEntries; //!< Number of process entries in procArray ++ nvmlProcessDetail_v1_t *procArray; //!< Process array ++} nvmlProcessDetailList_v1_t; ++ ++typedef nvmlProcessDetailList_v1_t nvmlProcessDetailList_t; ++ ++/** ++ * nvmlProcessDetailList version ++ */ ++#define nvmlProcessDetailList_v1 NVML_STRUCT_VERSION(ProcessDetailList, 1) ++ ++typedef struct nvmlDeviceAttributes_st ++{ ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++ unsigned int gpuInstanceSliceCount; //!< GPU instance slice count ++ unsigned int computeInstanceSliceCount; //!< Compute instance slice count ++ unsigned long long memorySizeMB; //!< Device memory size (in MiB) ++} nvmlDeviceAttributes_t; ++ ++/** ++ * C2C Mode information for a device ++ */ ++typedef struct ++{ ++ unsigned int isC2cEnabled; ++} nvmlC2cModeInfo_v1_t; ++ ++#define nvmlC2cModeInfo_v1 NVML_STRUCT_VERSION(C2cModeInfo, 1) ++ ++/** ++ * Possible values that classify the remap availability for each bank. The max ++ * field will contain the number of banks that have maximum remap availability ++ * (all reserved rows are available). None means that there are no reserved ++ * rows available. 
++ */ ++typedef struct nvmlRowRemapperHistogramValues_st ++{ ++ unsigned int max; ++ unsigned int high; ++ unsigned int partial; ++ unsigned int low; ++ unsigned int none; ++} nvmlRowRemapperHistogramValues_t; ++ ++/** ++ * Enum to represent type of bridge chip ++ */ ++typedef enum nvmlBridgeChipType_enum ++{ ++ NVML_BRIDGE_CHIP_PLX = 0, ++ NVML_BRIDGE_CHIP_BRO4 = 1 ++}nvmlBridgeChipType_t; ++ ++/** ++ * Maximum number of NvLink links supported ++ */ ++#define NVML_NVLINK_MAX_LINKS 18 ++ ++/** ++ * Enum to represent the NvLink utilization counter packet units ++ */ ++typedef enum nvmlNvLinkUtilizationCountUnits_enum ++{ ++ NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles ++ NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets ++ NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes ++ NVML_NVLINK_COUNTER_UNIT_RESERVED = 3, // count reserved for internal use ++ // this must be last ++ NVML_NVLINK_COUNTER_UNIT_COUNT ++} nvmlNvLinkUtilizationCountUnits_t; ++ ++/** ++ * Enum to represent the NvLink utilization counter packet types to count ++ * ** this is ONLY applicable with the units as packets or bytes ++ * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t ++ * ** all packet filter descriptions are target GPU centric ++ * ** these can be "OR'd" together ++ */ ++typedef enum nvmlNvLinkUtilizationCountPktTypes_enum ++{ ++ NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets ++ NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets ++ NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets ++ NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests ++ NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests ++ NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests ++ NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data ++ NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data ++ NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets ++} 
nvmlNvLinkUtilizationCountPktTypes_t; ++ ++/** ++ * Struct to define the NVLINK counter controls ++ */ ++typedef struct nvmlNvLinkUtilizationControl_st ++{ ++ nvmlNvLinkUtilizationCountUnits_t units; ++ nvmlNvLinkUtilizationCountPktTypes_t pktfilter; ++} nvmlNvLinkUtilizationControl_t; ++ ++/** ++ * Enum to represent NvLink queryable capabilities ++ */ ++typedef enum nvmlNvLinkCapability_enum ++{ ++ NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported ++ NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported ++ NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported ++ NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported ++ NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link ++ NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device ++ // should be last ++ NVML_NVLINK_CAP_COUNT ++} nvmlNvLinkCapability_t; ++ ++/** ++ * Enum to represent NvLink queryable error counters ++ */ ++typedef enum nvmlNvLinkErrorCounter_enum ++{ ++ NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter ++ NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter ++ NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter ++ NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter ++ NVML_NVLINK_ERROR_DL_ECC_DATA = 4, // Data link receive data ECC error counter ++ ++ // this must be last ++ NVML_NVLINK_ERROR_COUNT ++} nvmlNvLinkErrorCounter_t; ++ ++/** ++ * Enum to represent NvLink's remote device type ++ */ ++typedef enum nvmlIntNvLinkDeviceType_enum ++{ ++ NVML_NVLINK_DEVICE_TYPE_GPU = 0x00, ++ NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01, ++ NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02, ++ NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF ++} nvmlIntNvLinkDeviceType_t; ++ ++/** ++ * Represents level relationships within a system between two GPUs ++ * The enums are spaced to allow for future relationships ++ */ ++typedef enum 
nvmlGpuLevel_enum ++{ ++ NVML_TOPOLOGY_INTERNAL = 0, // e.g. Tesla K80 ++ NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch ++ NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge ++ NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge ++ NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges ++ NVML_TOPOLOGY_SYSTEM = 50 // all devices in the system ++ ++ // there is purposefully no COUNT here because of the need for spacing above ++} nvmlGpuTopologyLevel_t; ++ ++/* Compatibility for CPU->NODE renaming */ ++#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE ++ ++/* P2P Capability Index Status*/ ++typedef enum nvmlGpuP2PStatus_enum ++{ ++ NVML_P2P_STATUS_OK = 0, ++ NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, ++ NVML_P2P_STATUS_CHIPSET_NOT_SUPPORTED = NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, ++ NVML_P2P_STATUS_GPU_NOT_SUPPORTED, ++ NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, ++ NVML_P2P_STATUS_DISABLED_BY_REGKEY, ++ NVML_P2P_STATUS_NOT_SUPPORTED, ++ NVML_P2P_STATUS_UNKNOWN ++ ++} nvmlGpuP2PStatus_t; ++ ++/* P2P Capability Index*/ ++typedef enum nvmlGpuP2PCapsIndex_enum ++{ ++ NVML_P2P_CAPS_INDEX_READ = 0, ++ NVML_P2P_CAPS_INDEX_WRITE = 1, ++ NVML_P2P_CAPS_INDEX_NVLINK = 2, ++ NVML_P2P_CAPS_INDEX_ATOMICS = 3, ++ NVML_P2P_CAPS_INDEX_PCI = 4, ++ /* ++ * DO NOT USE! NVML_P2P_CAPS_INDEX_PROP is deprecated. ++ * Use NVML_P2P_CAPS_INDEX_PCI instead. ++ */ ++ NVML_P2P_CAPS_INDEX_PROP = NVML_P2P_CAPS_INDEX_PCI, ++ NVML_P2P_CAPS_INDEX_UNKNOWN = 5, ++}nvmlGpuP2PCapsIndex_t; ++ ++/** ++ * Maximum limit on Physical Bridges per Board ++ */ ++#define NVML_MAX_PHYSICAL_BRIDGE (128) ++ ++/** ++ * Information about the Bridge Chip Firmware ++ */ ++typedef struct nvmlBridgeChipInfo_st ++{ ++ nvmlBridgeChipType_t type; //!< Type of Bridge Chip ++ unsigned int fwVersion; //!< Firmware Version. 
0=Version is unavailable ++}nvmlBridgeChipInfo_t; ++ ++/** ++ * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate ++ * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. ++ */ ++typedef struct nvmlBridgeChipHierarchy_st ++{ ++ unsigned char bridgeCount; //!< Number of Bridge Chips on the Board ++ nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board ++}nvmlBridgeChipHierarchy_t; ++ ++/** ++ * Represents Type of Sampling Event ++ */ ++typedef enum nvmlSamplingType_enum ++{ ++ NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU ++ NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU ++ NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written ++ NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy ++ NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy ++ NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples ++ NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples ++ NVML_MODULE_POWER_SAMPLES = 7, //!< To represent module power samples for total module starting Grace Hopper ++ NVML_JPG_UTILIZATION_SAMPLES = 8, //!< To represent percent of time during which NVJPG remains busy ++ NVML_OFA_UTILIZATION_SAMPLES = 9, //!< To represent percent of time during which NVOFA remains busy ++ ++ // Keep this last ++ NVML_SAMPLINGTYPE_COUNT ++}nvmlSamplingType_t; ++ ++/** ++ * Represents the queryable PCIe utilization counters ++ */ ++typedef enum nvmlPcieUtilCounter_enum ++{ ++ NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity ++ NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity ++ ++ // Keep this last ++ NVML_PCIE_UTIL_COUNT ++} 
nvmlPcieUtilCounter_t; ++ ++/** ++ * Represents the type for sample value returned ++ */ ++typedef enum nvmlValueType_enum ++{ ++ NVML_VALUE_TYPE_DOUBLE = 0, ++ NVML_VALUE_TYPE_UNSIGNED_INT = 1, ++ NVML_VALUE_TYPE_UNSIGNED_LONG = 2, ++ NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, ++ NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, ++ NVML_VALUE_TYPE_SIGNED_INT = 5, ++ NVML_VALUE_TYPE_UNSIGNED_SHORT = 6, ++ ++ // Keep this last ++ NVML_VALUE_TYPE_COUNT ++}nvmlValueType_t; ++ ++ ++/** ++ * Union to represent different types of Value ++ */ ++typedef union nvmlValue_st ++{ ++ double dVal; //!< If the value is double ++ int siVal; //!< If the value is signed int ++ unsigned int uiVal; //!< If the value is unsigned int ++ unsigned long ulVal; //!< If the value is unsigned long ++ unsigned long long ullVal; //!< If the value is unsigned long long ++ signed long long sllVal; //!< If the value is signed long long ++ unsigned short usVal; //!< If the value is unsigned short ++}nvmlValue_t; ++ ++/** ++ * Information for Sample ++ */ ++typedef struct nvmlSample_st ++{ ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ nvmlValue_t sampleValue; //!< Sample Value ++}nvmlSample_t; ++ ++/** ++ * Represents type of perf policy for which violation times can be queried ++ */ ++typedef enum nvmlPerfPolicyType_enum ++{ ++ NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks ++ NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks ++ NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks ++ NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks ++ NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks ++ NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be 
below application clocks ++ ++ NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) ++ NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks ++ ++ // Keep this last ++ NVML_PERF_POLICY_COUNT ++}nvmlPerfPolicyType_t; ++ ++/** ++ * Struct to hold perf policy violation status data ++ */ ++typedef struct nvmlViolationTime_st ++{ ++ unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds ++ unsigned long long violationTime; //!< violationTime in Nanoseconds ++}nvmlViolationTime_t; ++ ++#define NVML_MAX_THERMAL_SENSORS_PER_GPU 3 ++ ++/** ++ * Represents the thermal sensor targets ++ */ ++typedef enum ++{ ++ NVML_THERMAL_TARGET_NONE = 0, ++ NVML_THERMAL_TARGET_GPU = 1, //!< GPU core temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_MEMORY = 2, //!< GPU memory temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_POWER_SUPPLY = 4, //!< GPU power supply temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_BOARD = 8, //!< GPU board ambient temperature requires NvPhysicalGpuHandle ++ NVML_THERMAL_TARGET_VCD_BOARD = 9, //!< Visual Computing Device Board temperature requires NvVisualComputingDeviceHandle ++ NVML_THERMAL_TARGET_VCD_INLET = 10, //!< Visual Computing Device Inlet temperature requires NvVisualComputingDeviceHandle ++ NVML_THERMAL_TARGET_VCD_OUTLET = 11, //!< Visual Computing Device Outlet temperature requires NvVisualComputingDeviceHandle ++ ++ NVML_THERMAL_TARGET_ALL = 15, ++ NVML_THERMAL_TARGET_UNKNOWN = -1, ++} nvmlThermalTarget_t; ++ ++/** ++ * Represents the thermal sensor controllers ++ */ ++typedef enum ++{ ++ NVML_THERMAL_CONTROLLER_NONE = 0, ++ NVML_THERMAL_CONTROLLER_GPU_INTERNAL, ++ NVML_THERMAL_CONTROLLER_ADM1032, ++ NVML_THERMAL_CONTROLLER_ADT7461, ++ NVML_THERMAL_CONTROLLER_MAX6649, ++ NVML_THERMAL_CONTROLLER_MAX1617, ++ NVML_THERMAL_CONTROLLER_LM99, ++ 
NVML_THERMAL_CONTROLLER_LM89, ++ NVML_THERMAL_CONTROLLER_LM64, ++ NVML_THERMAL_CONTROLLER_G781, ++ NVML_THERMAL_CONTROLLER_ADT7473, ++ NVML_THERMAL_CONTROLLER_SBMAX6649, ++ NVML_THERMAL_CONTROLLER_VBIOSEVT, ++ NVML_THERMAL_CONTROLLER_OS, ++ NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS, ++ NVML_THERMAL_CONTROLLER_NVSYSCON_E551, ++ NVML_THERMAL_CONTROLLER_MAX6649R, ++ NVML_THERMAL_CONTROLLER_ADT7473S, ++ NVML_THERMAL_CONTROLLER_UNKNOWN = -1, ++} nvmlThermalController_t; ++ ++/** ++ * Struct to hold the thermal sensor settings ++ */ ++typedef struct ++{ ++ unsigned int count; ++ struct ++ { ++ nvmlThermalController_t controller; ++ int defaultMinTemp; ++ int defaultMaxTemp; ++ int currentTemp; ++ nvmlThermalTarget_t target; ++ } sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU]; ++ ++} nvmlGpuThermalSettings_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceEnums Device Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Generic enable/disable enum. ++ */ ++typedef enum nvmlEnableState_enum ++{ ++ NVML_FEATURE_DISABLED = 0, //!< Feature disabled ++ NVML_FEATURE_ENABLED = 1 //!< Feature enabled ++} nvmlEnableState_t; ++ ++//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. ++#define nvmlFlagDefault 0x00 ++//! Generic flag used to force some behavior. See description of particular functions for details. ++#define nvmlFlagForce 0x01 ++ ++/** ++ * * The Brand of the GPU ++ * */ ++typedef enum nvmlBrandType_enum ++{ ++ NVML_BRAND_UNKNOWN = 0, ++ NVML_BRAND_QUADRO = 1, ++ NVML_BRAND_TESLA = 2, ++ NVML_BRAND_NVS = 3, ++ NVML_BRAND_GRID = 4, // Deprecated from API reporting. Keeping definition for backward compatibility. 
++ NVML_BRAND_GEFORCE = 5, ++ NVML_BRAND_TITAN = 6, ++ NVML_BRAND_NVIDIA_VAPPS = 7, // NVIDIA Virtual Applications ++ NVML_BRAND_NVIDIA_VPC = 8, // NVIDIA Virtual PC ++ NVML_BRAND_NVIDIA_VCS = 9, // NVIDIA Virtual Compute Server ++ NVML_BRAND_NVIDIA_VWS = 10, // NVIDIA RTX Virtual Workstation ++ NVML_BRAND_NVIDIA_CLOUD_GAMING = 11, // NVIDIA Cloud Gaming ++ NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING, // Deprecated from API reporting. Keeping definition for backward compatibility. ++ NVML_BRAND_QUADRO_RTX = 12, ++ NVML_BRAND_NVIDIA_RTX = 13, ++ NVML_BRAND_NVIDIA = 14, ++ NVML_BRAND_GEFORCE_RTX = 15, // Unused ++ NVML_BRAND_TITAN_RTX = 16, // Unused ++ ++ // Keep this last ++ NVML_BRAND_COUNT ++} nvmlBrandType_t; ++ ++/** ++ * Temperature thresholds. ++ */ ++typedef enum nvmlTemperatureThresholds_enum ++{ ++ NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will ++ // shut down for HW protection ++ NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will ++ // begin HW slowdown ++ NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will ++ // begin SW slowdown ++ NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU ++ // can be throttled below base clock ++ NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4, // Minimum GPU Temperature that can be ++ // set as acoustic threshold ++ NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5, // Current temperature that is set as ++ // acoustic threshold. ++ NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6, // Maximum GPU temperature that can be ++ // set as acoustic threshold. ++ NVML_TEMPERATURE_THRESHOLD_GPS_CURR = 7, // Current temperature that is set as ++ // gps threshold. ++ // Keep this last ++ NVML_TEMPERATURE_THRESHOLD_COUNT ++} nvmlTemperatureThresholds_t; ++ ++/** ++ * Temperature sensors. 
++ */ ++typedef enum nvmlTemperatureSensors_enum ++{ ++ NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die ++ ++ // Keep this last ++ NVML_TEMPERATURE_COUNT ++} nvmlTemperatureSensors_t; ++ ++/** ++ * Compute mode. ++ * ++ * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. ++ * Earlier CUDA versions supported a single exclusive mode, ++ * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. ++ */ ++typedef enum nvmlComputeMode_enum ++{ ++ NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device ++ NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed ++ NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device ++ NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time ++ ++ // Keep this last ++ NVML_COMPUTEMODE_COUNT ++} nvmlComputeMode_t; ++ ++/** ++ * Max Clock Monitors available ++ */ ++#define MAX_CLK_DOMAINS 32 ++ ++/** ++ * Clock Monitor error types ++ */ ++typedef struct nvmlClkMonFaultInfo_struct { ++ /** ++ * The Domain which faulted ++ */ ++ unsigned int clkApiDomain; ++ ++ /** ++ * Faults Information ++ */ ++ unsigned int clkDomainFaultMask; ++} nvmlClkMonFaultInfo_t; ++ ++/** ++ * Clock Monitor Status ++ */ ++typedef struct nvmlClkMonStatus_status { ++ /** ++ * Fault status Indicator ++ */ ++ unsigned int bGlobalStatus; ++ ++ /** ++ * Total faulted domain numbers ++ */ ++ unsigned int clkMonListSize; ++ ++ /** ++ * The fault Information structure ++ */ ++ nvmlClkMonFaultInfo_t clkMonList[MAX_CLK_DOMAINS]; ++} nvmlClkMonStatus_t; ++ ++/** ++ * ECC bit types. 
++ * ++ * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type ++ */ ++#define nvmlEccBitType_t nvmlMemoryErrorType_t ++ ++/** ++ * Single bit ECC errors ++ * ++ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED ++ */ ++#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED ++ ++/** ++ * Double bit ECC errors ++ * ++ * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED ++ */ ++#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED ++ ++/** ++ * Memory error types ++ */ ++typedef enum nvmlMemoryErrorType_enum ++{ ++ /** ++ * A memory error that was corrected ++ * ++ * For ECC errors, these are single bit errors ++ * For Texture memory, these are errors fixed by resend ++ */ ++ NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, ++ /** ++ * A memory error that was not corrected ++ * ++ * For ECC errors, these are double bit errors ++ * For Texture memory, these are errors where the resend fails ++ */ ++ NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, ++ ++ ++ // Keep this last ++ NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types ++ ++} nvmlMemoryErrorType_t; ++ ++/** ++ * ECC counter types. ++ * ++ * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. ++ * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver ++ * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app ++ * is run. ++ */ ++typedef enum nvmlEccCounterType_enum ++{ ++ NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. ++ NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) ++ ++ // Keep this last ++ NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types ++} nvmlEccCounterType_t; ++ ++/** ++ * Clock types. ++ * ++ * All speeds are in Mhz. 
++ */ ++typedef enum nvmlClockType_enum ++{ ++ NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain ++ NVML_CLOCK_SM = 1, //!< SM clock domain ++ NVML_CLOCK_MEM = 2, //!< Memory clock domain ++ NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain ++ ++ // Keep this last ++ NVML_CLOCK_COUNT //!< Count of clock types ++} nvmlClockType_t; ++ ++/** ++ * Clock Ids. These are used in combination with nvmlClockType_t ++ * to specify a single clock value. ++ */ ++typedef enum nvmlClockId_enum ++{ ++ NVML_CLOCK_ID_CURRENT = 0, //!< Current actual clock value ++ NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock ++ NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, //!< Default application clock target ++ NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, //!< OEM-defined maximum clock rate ++ ++ //Keep this last ++ NVML_CLOCK_ID_COUNT //!< Count of Clock Ids. ++} nvmlClockId_t; ++ ++/** ++ * Driver models. ++ * ++ * Windows only. ++ */ ++ ++typedef enum nvmlDriverModel_enum ++{ ++ NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device ++ NVML_DRIVER_WDM = 1, //!< WDM (TCC) model (deprecated) -- GPU treated as a generic compute device ++ NVML_DRIVER_MCDM = 2 //!< MCDM driver model -- GPU treated as a Microsoft compute device ++} nvmlDriverModel_t; ++ ++#define NVML_MAX_GPU_PERF_PSTATES 16 ++ ++/** ++ * Allowed PStates. 
++ */ ++typedef enum nvmlPStates_enum ++{ ++ NVML_PSTATE_0 = 0, //!< Performance state 0 -- Maximum Performance ++ NVML_PSTATE_1 = 1, //!< Performance state 1 ++ NVML_PSTATE_2 = 2, //!< Performance state 2 ++ NVML_PSTATE_3 = 3, //!< Performance state 3 ++ NVML_PSTATE_4 = 4, //!< Performance state 4 ++ NVML_PSTATE_5 = 5, //!< Performance state 5 ++ NVML_PSTATE_6 = 6, //!< Performance state 6 ++ NVML_PSTATE_7 = 7, //!< Performance state 7 ++ NVML_PSTATE_8 = 8, //!< Performance state 8 ++ NVML_PSTATE_9 = 9, //!< Performance state 9 ++ NVML_PSTATE_10 = 10, //!< Performance state 10 ++ NVML_PSTATE_11 = 11, //!< Performance state 11 ++ NVML_PSTATE_12 = 12, //!< Performance state 12 ++ NVML_PSTATE_13 = 13, //!< Performance state 13 ++ NVML_PSTATE_14 = 14, //!< Performance state 14 ++ NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance ++ NVML_PSTATE_UNKNOWN = 32 //!< Unknown performance state ++} nvmlPstates_t; ++ ++/** ++ * Clock offset info. ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ nvmlClockType_t type; ++ nvmlPstates_t pstate; ++ int clockOffsetMHz; ++ int minClockOffsetMHz; ++ int maxClockOffsetMHz; ++} nvmlClockOffset_v1_t; ++ ++typedef nvmlClockOffset_v1_t nvmlClockOffset_t; ++ ++#define nvmlClockOffset_v1 NVML_STRUCT_VERSION(ClockOffset, 1) ++ ++/** ++ * GPU Operation Mode ++ * ++ * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features. ++ * ++ * Each GOM is designed to meet specific user needs. ++ */ ++typedef enum nvmlGom_enum ++{ ++ NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed ++ ++ NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations ++ //!< are not allowed ++ ++ NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require ++ //!< high bandwidth double precision ++} nvmlGpuOperationMode_t; ++ ++/** ++ * Available infoROM objects. 
++ */ ++typedef enum nvmlInforomObject_enum ++{ ++ NVML_INFOROM_OEM = 0, //!< An object defined by OEM ++ NVML_INFOROM_ECC = 1, //!< The ECC object determining the level of ECC support ++ NVML_INFOROM_POWER = 2, //!< The power management object ++ ++ // Keep this last ++ NVML_INFOROM_COUNT //!< This counts the number of infoROM objects the driver knows about ++} nvmlInforomObject_t; ++ ++/** ++ * Return values for NVML API calls. ++ */ ++typedef enum nvmlReturn_enum ++{ ++ // cppcheck-suppress * ++ NVML_SUCCESS = 0, //!< The operation was successful ++ NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() ++ NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid ++ NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device ++ NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation ++ NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting ++ NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful ++ NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough ++ NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached ++ NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded ++ NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed ++ NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU ++ NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded ++ NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function ++ NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted ++ NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible ++ NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again ++ NVML_ERROR_OPERATING_SYSTEM = 17, //!< The 
GPU control device has been blocked by the operating system/cgroups ++ NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch ++ NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use ++ NVML_ERROR_MEMORY = 20, //!< Insufficient memory ++ NVML_ERROR_NO_DATA = 21, //!< No data ++ NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, because ECC is enabled ++ NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory ++ NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Requested frequency is not supported ++ NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported ++ NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated ++ NVML_ERROR_NOT_READY = 27, //!< The system is not ready for the request ++ NVML_ERROR_GPU_NOT_FOUND = 28, //!< No GPUs were found ++ NVML_ERROR_INVALID_STATE = 29, //!< Resource not in correct state to perform requested operation ++ NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred ++} nvmlReturn_t; ++ ++/** ++ * See \ref nvmlDeviceGetMemoryErrorCounter ++ */ ++typedef enum nvmlMemoryLocation_enum ++{ ++ NVML_MEMORY_LOCATION_L1_CACHE = 0, //!< GPU L1 Cache ++ NVML_MEMORY_LOCATION_L2_CACHE = 1, //!< GPU L2 Cache ++ NVML_MEMORY_LOCATION_DRAM = 2, //!< Turing+ DRAM ++ NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2, //!< GPU Device Memory ++ NVML_MEMORY_LOCATION_REGISTER_FILE = 3, //!< GPU Register File ++ NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory ++ NVML_MEMORY_LOCATION_TEXTURE_SHM = 5, //!< Shared memory ++ NVML_MEMORY_LOCATION_CBU = 6, //!< CBU ++ NVML_MEMORY_LOCATION_SRAM = 7, //!< Turing+ SRAM ++ // Keep this last ++ NVML_MEMORY_LOCATION_COUNT //!< This counts the number of memory locations the driver knows about ++} nvmlMemoryLocation_t; ++ ++/** ++ * Causes for page retirement ++ */
++typedef enum nvmlPageRetirementCause_enum ++{ ++ NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error ++ NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1, //!< Page was retired due to double bit ECC error ++ ++ // Keep this last ++ NVML_PAGE_RETIREMENT_CAUSE_COUNT ++} nvmlPageRetirementCause_t; ++ ++/** ++ * API types that allow changes to default permission restrictions ++ */ ++typedef enum nvmlRestrictedAPI_enum ++{ ++ NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks ++ //!< and see nvmlDeviceResetApplicationsClocks ++ NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks ++ //!< see nvmlDeviceSetAutoBoostedClocksEnabled ++ // Keep this last ++ NVML_RESTRICTED_API_COUNT ++} nvmlRestrictedAPI_t; ++ ++/** ++ * Structure to store utilization value and process Id ++ */ ++typedef struct nvmlProcessUtilizationSample_st ++{ ++ unsigned int pid; //!< PID of process ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++} nvmlProcessUtilizationSample_t; ++ ++/** ++ * Structure to store utilization value and process Id -- version 1 ++ */ ++typedef struct ++{ ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ unsigned int pid; //!< PID of process ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++ unsigned int jpgUtil; //!< Jpeg Util Value ++ unsigned int ofaUtil; //!< Ofa Util Value ++} nvmlProcessUtilizationInfo_v1_t; ++ ++/** ++ * Structure to store utilization and 
process ID for each running process -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int processSamplesCount; //!< Caller-supplied array size, and returns number of processes running ++ unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp ++ nvmlProcessUtilizationInfo_v1_t *procUtilArray; //!< The array (allocated by caller) of the utilization of GPU SM, framebuffer, video encoder, video decoder, JPEG, and OFA ++} nvmlProcessesUtilizationInfo_v1_t; ++typedef nvmlProcessesUtilizationInfo_v1_t nvmlProcessesUtilizationInfo_t; ++#define nvmlProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(ProcessesUtilizationInfo, 1) ++ ++/** ++ * Structure to store SRAM uncorrectable error counters ++ */ ++typedef struct ++{ ++ unsigned int version; //!< the API version number ++ unsigned long long aggregateUncParity; //!< aggregate uncorrectable parity error count ++ unsigned long long aggregateUncSecDed; //!< aggregate uncorrectable SEC-DED error count ++ unsigned long long aggregateCor; //!< aggregate correctable error count ++ unsigned long long volatileUncParity; //!< volatile uncorrectable parity error count ++ unsigned long long volatileUncSecDed; //!< volatile uncorrectable SEC-DED error count ++ unsigned long long volatileCor; //!< volatile correctable error count ++ unsigned long long aggregateUncBucketL2; //!< aggregate uncorrectable error count for L2 cache bucket ++ unsigned long long aggregateUncBucketSm; //!< aggregate uncorrectable error count for SM bucket ++ unsigned long long aggregateUncBucketPcie; //!< aggregate uncorrectable error count for PCIE bucket ++ unsigned long long aggregateUncBucketMcu; //!< aggregate uncorrectable error count for Microcontroller bucket ++ unsigned long long aggregateUncBucketOther; //!< aggregate uncorrectable error count for Other bucket ++ unsigned int bThresholdExceeded; //!< if the error threshold of field diag 
is exceeded ++} nvmlEccSramErrorStatus_v1_t; ++ ++typedef nvmlEccSramErrorStatus_v1_t nvmlEccSramErrorStatus_t; ++#define nvmlEccSramErrorStatus_v1 NVML_STRUCT_VERSION(EccSramErrorStatus, 1) ++ ++/** ++ * GSP firmware ++ */ ++#define NVML_GSP_FIRMWARE_VERSION_BUF_SIZE 0x40 ++ ++/** ++ * Simplified chip architecture ++ */ ++#define NVML_DEVICE_ARCH_KEPLER 2 // Devices based on the NVIDIA Kepler architecture ++#define NVML_DEVICE_ARCH_MAXWELL 3 // Devices based on the NVIDIA Maxwell architecture ++#define NVML_DEVICE_ARCH_PASCAL 4 // Devices based on the NVIDIA Pascal architecture ++#define NVML_DEVICE_ARCH_VOLTA 5 // Devices based on the NVIDIA Volta architecture ++#define NVML_DEVICE_ARCH_TURING 6 // Devices based on the NVIDIA Turing architecture ++#define NVML_DEVICE_ARCH_AMPERE 7 // Devices based on the NVIDIA Ampere architecture ++#define NVML_DEVICE_ARCH_ADA 8 // Devices based on the NVIDIA Ada architecture ++#define NVML_DEVICE_ARCH_HOPPER 9 // Devices based on the NVIDIA Hopper architecture ++ ++#define NVML_DEVICE_ARCH_BLACKWELL 10 // Devices based on the NVIDIA Blackwell architecture ++ ++#define NVML_DEVICE_ARCH_T23X 11 // Devices based on NVIDIA Orin architecture ++ ++#define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer ++ ++typedef unsigned int nvmlDeviceArchitecture_t; ++ ++/** ++ * PCI bus types ++ */ ++#define NVML_BUS_TYPE_UNKNOWN 0 ++#define NVML_BUS_TYPE_PCI 1 ++#define NVML_BUS_TYPE_PCIE 2 ++#define NVML_BUS_TYPE_FPCI 3 ++#define NVML_BUS_TYPE_AGP 4 ++ ++typedef unsigned int nvmlBusType_t; ++ ++/** ++ * Device Power Modes ++ */ ++ ++/** ++ * Device Fan control policy ++ */ ++#define NVML_FAN_POLICY_TEMPERATURE_CONTINOUS_SW 0 ++#define NVML_FAN_POLICY_MANUAL 1 ++ ++typedef unsigned int nvmlFanControlPolicy_t; ++ ++/** ++ * Device Power Source ++ */ ++#define NVML_POWER_SOURCE_AC 0x00000000 ++#define NVML_POWER_SOURCE_BATTERY 0x00000001 ++#define NVML_POWER_SOURCE_UNDERSIZED 0x00000002 ++ ++typedef unsigned int 
nvmlPowerSource_t; ++ ++/* ++ * Device PCIE link Max Speed ++ */ ++#define NVML_PCIE_LINK_MAX_SPEED_INVALID 0x00000000 ++#define NVML_PCIE_LINK_MAX_SPEED_2500MBPS 0x00000001 ++#define NVML_PCIE_LINK_MAX_SPEED_5000MBPS 0x00000002 ++#define NVML_PCIE_LINK_MAX_SPEED_8000MBPS 0x00000003 ++#define NVML_PCIE_LINK_MAX_SPEED_16000MBPS 0x00000004 ++#define NVML_PCIE_LINK_MAX_SPEED_32000MBPS 0x00000005 ++#define NVML_PCIE_LINK_MAX_SPEED_64000MBPS 0x00000006 ++ ++/* ++ * Adaptive clocking status ++ */ ++#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED 0x00000000 ++#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED 0x00000001 ++ ++#define NVML_MAX_GPU_UTILIZATIONS 8 ++ ++/** ++ * Represents the GPU utilization domains ++ */ ++typedef enum nvmlGpuUtilizationDomainId_t ++{ ++ NVML_GPU_UTILIZATION_DOMAIN_GPU = 0, //!< Graphics engine domain ++ NVML_GPU_UTILIZATION_DOMAIN_FB = 1, //!< Frame buffer domain ++ NVML_GPU_UTILIZATION_DOMAIN_VID = 2, //!< Video engine domain ++ NVML_GPU_UTILIZATION_DOMAIN_BUS = 3, //!< Bus interface domain ++} nvmlGpuUtilizationDomainId_t; ++ ++typedef struct nvmlGpuDynamicPstatesInfo_st ++{ ++ unsigned int flags; //!< Reserved for future use ++ struct ++ { ++ unsigned int bIsPresent; //!< Set if this utilization domain is present on this GPU ++ unsigned int percentage; //!< Percentage of time where the domain is considered busy in the last 1-second interval ++ unsigned int incThreshold; //!< Utilization threshold that can trigger a perf-increasing P-State change when crossed ++ unsigned int decThreshold; //!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed ++ } utilization[NVML_MAX_GPU_UTILIZATIONS]; ++} nvmlGpuDynamicPstatesInfo_t; ++ ++/* ++ * PCIe outbound/inbound atomic operations capability ++ */ ++#define NVML_PCIE_ATOMICS_CAP_FETCHADD32 0x01 ++#define NVML_PCIE_ATOMICS_CAP_FETCHADD64 0x02 ++#define NVML_PCIE_ATOMICS_CAP_SWAP32 0x04 ++#define NVML_PCIE_ATOMICS_CAP_SWAP64 0x08 ++#define 
NVML_PCIE_ATOMICS_CAP_CAS32 0x10 ++#define NVML_PCIE_ATOMICS_CAP_CAS64 0x20 ++#define NVML_PCIE_ATOMICS_CAP_CAS128 0x40 ++#define NVML_PCIE_ATOMICS_OPS_MAX 7 ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @addtogroup virtualGPU vGPU Enums, Constants, Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++/** @defgroup nvmlVirtualGpuEnums vGPU Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/*! ++ * GPU virtualization mode types. ++ */ ++typedef enum nvmlGpuVirtualizationMode { ++ NVML_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU ++ NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough ++ NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. ++ NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode ++ NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4 //!< Device is associated with VGX hypervisor in vSGA mode ++} nvmlGpuVirtualizationMode_t; ++ ++/** ++ * Host vGPU modes ++ */ ++typedef enum nvmlHostVgpuMode_enum ++{ ++ NVML_HOST_VGPU_MODE_NON_SRIOV = 0, //!< Non SR-IOV mode ++ NVML_HOST_VGPU_MODE_SRIOV = 1 //!< SR-IOV mode ++} nvmlHostVgpuMode_t; ++ ++/*!
++ * Types of VM identifiers ++ */ ++typedef enum nvmlVgpuVmIdType { ++ NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID ++ NVML_VGPU_VM_ID_UUID = 1 //!< VM ID represents UUID ++} nvmlVgpuVmIdType_t; ++ ++/** ++ * vGPU GUEST info state ++ */ ++typedef enum nvmlVgpuGuestInfoState_enum ++{ ++ NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //!< Guest-dependent fields uninitialized ++ NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1 //!< Guest-dependent fields initialized ++} nvmlVgpuGuestInfoState_t; ++ ++/** ++ * vGPU software licensable features ++ */ ++typedef enum { ++ NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0, //!< Unknown ++ NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1, //!< Virtual GPU ++ NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2, //!< Nvidia RTX ++ NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX, //!< Deprecated, do not use. ++ NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3, //!< Gaming ++ NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 //!< Compute ++} nvmlGridLicenseFeatureCode_t; ++ ++/** ++ * Status codes for license expiry ++ */ ++#define NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE 0 //!< Expiry information not available ++#define NVML_GRID_LICENSE_EXPIRY_INVALID 1 //!< Invalid expiry or error fetching expiry ++#define NVML_GRID_LICENSE_EXPIRY_VALID 2 //!< Valid expiry ++#define NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE 3 //!< Expiry not applicable ++#define NVML_GRID_LICENSE_EXPIRY_PERMANENT 4 //!< Permanent expiry ++ ++/** ++ * vGPU queryable capabilities ++ */ ++typedef enum nvmlVgpuCapability_enum ++{ ++ NVML_VGPU_CAP_NVLINK_P2P = 0, //!< P2P over NVLink is supported ++ NVML_VGPU_CAP_GPUDIRECT = 1, //!< GPUDirect capability is supported ++ NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE = 2, //!< vGPU profile cannot be mixed with other vGPU profiles in same VM ++ NVML_VGPU_CAP_EXCLUSIVE_TYPE = 3, //!< vGPU profile cannot run on a GPU alongside other profiles of different type ++ NVML_VGPU_CAP_EXCLUSIVE_SIZE = 4, //!< 
vGPU profile cannot run on a GPU alongside other profiles of different size ++ // Keep this last ++ NVML_VGPU_CAP_COUNT ++} nvmlVgpuCapability_t; ++ ++/** ++* vGPU driver queryable capabilities ++*/ ++typedef enum nvmlVgpuDriverCapability_enum ++{ ++ NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0, //!< Supports mixing of different vGPU profiles within one guest VM ++ NVML_VGPU_DRIVER_CAP_WARM_UPDATE = 1, //!< Supports FSR and warm update of vGPU host driver without terminating the running guest VM ++ // Keep this last ++ NVML_VGPU_DRIVER_CAP_COUNT ++} nvmlVgpuDriverCapability_t; ++ ++/** ++* Device vGPU queryable capabilities ++*/ ++typedef enum nvmlDeviceVgpuCapability_enum ++{ ++ NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0, //!< Query if the fractional vGPU profiles on this GPU can be used in multi-vGPU configurations ++ NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1, //!< Query if the GPU supports concurrent execution of timesliced vGPU profiles of differing types ++ NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2, //!< Query if the GPU supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes ++ NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW = 3, //!< Query the GPU's read_device_buffer expected bandwidth capacity in megabytes per second ++ NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW = 4, //!< Query the GPU's write_device_buffer expected bandwidth capacity in megabytes per second ++ NVML_DEVICE_VGPU_CAP_DEVICE_STREAMING = 5, //!< Query if vGPU profiles on the GPU support migration data streaming ++ NVML_DEVICE_VGPU_CAP_MINI_QUARTER_GPU = 6, //!< Set/Get support for mini-quarter vGPU profiles ++ NVML_DEVICE_VGPU_CAP_COMPUTE_MEDIA_ENGINE_GPU = 7, //!< Set/Get support for compute media engine vGPU profiles ++ NVML_DEVICE_VGPU_CAP_WARM_UPDATE = 8, //!< Query if the GPU supports FSR and warm update ++ // Keep this last ++ NVML_DEVICE_VGPU_CAP_COUNT ++} nvmlDeviceVgpuCapability_t; ++ ++/** @} */ ++ 
++/***************************************************************************************************/ ++ ++/** @defgroup nvmlVgpuConstants vGPU Constants ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense ++ */ ++#define NVML_GRID_LICENSE_BUFFER_SIZE 128 ++ ++#define NVML_VGPU_NAME_BUFFER_SIZE 64 /**< Size of the processName buffers in the vGPU process utilization structs */ ++ ++#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 /**< Capacity of nvmlGridLicensableFeatures_t::gridLicensableFeatures */ ++ ++#define INVALID_GPU_INSTANCE_PROFILE_ID 0xFFFFFFFF ++ ++#define INVALID_GPU_INSTANCE_ID 0xFFFFFFFF ++ ++#define NVML_INVALID_VGPU_PLACEMENT_ID 0xFFFF ++ ++/*! ++ * Macros for vGPU instance's virtualization capabilities bitfield. ++ */ ++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 ++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 ++#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 ++ ++/*! ++ * Macros for pGPU's virtualization capabilities bitfield. ++ */ ++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 ++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 ++#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlVgpuStructs vGPU Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++typedef unsigned int nvmlVgpuTypeId_t; ++ ++typedef unsigned int nvmlVgpuInstance_t; ++ ++/** ++ * Structure to store the vGPU heterogeneous mode of device -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int mode; //!< The vGPU heterogeneous mode ++} nvmlVgpuHeterogeneousMode_v1_t; ++typedef nvmlVgpuHeterogeneousMode_v1_t nvmlVgpuHeterogeneousMode_t; ++#define nvmlVgpuHeterogeneousMode_v1 NVML_STRUCT_VERSION(VgpuHeterogeneousMode, 1) ++ ++/** ++ * Structure to store 
the placement ID of vGPU instance -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int placementId; //!< Placement ID of the active vGPU instance ++} nvmlVgpuPlacementId_v1_t; ++typedef nvmlVgpuPlacementId_v1_t nvmlVgpuPlacementId_t; ++#define nvmlVgpuPlacementId_v1 NVML_STRUCT_VERSION(VgpuPlacementId, 1) ++ ++/** ++ * Structure to store the list of vGPU placements -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int placementSize; //!< The number of slots occupied by the vGPU type ++ unsigned int count; //!< Count of placement IDs fetched ++ unsigned int *placementIds; //!< Placement IDs for the vGPU type ++} nvmlVgpuPlacementList_v1_t; ++typedef nvmlVgpuPlacementList_v1_t nvmlVgpuPlacementList_t; ++#define nvmlVgpuPlacementList_v1 NVML_STRUCT_VERSION(VgpuPlacementList, 1) ++ ++/** ++ * Structure to store BAR1 size information of vGPU type -- Version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned long long bar1Size; //!< BAR1 size in megabytes ++} nvmlVgpuTypeBar1Info_v1_t; ++typedef nvmlVgpuTypeBar1Info_v1_t nvmlVgpuTypeBar1Info_t; ++#define nvmlVgpuTypeBar1Info_v1 NVML_STRUCT_VERSION(VgpuTypeBar1Info, 1) ++ ++/** ++ * Structure to store Utilization Value and vgpuInstance ++ */ ++typedef struct nvmlVgpuInstanceUtilizationSample_st ++{ ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value ++ nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value ++ nvmlValue_t encUtil; //!< Encoder Util Value ++ nvmlValue_t decUtil; //!< Decoder Util Value ++} nvmlVgpuInstanceUtilizationSample_t; ++ ++/** ++ * Structure to store Utilization Value and vgpuInstance Info -- Version 1 ++ */ ++typedef struct ++{ ++ unsigned long long timeStamp; //!< CPU Timestamp in 
microseconds ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value ++ nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value ++ nvmlValue_t encUtil; //!< Encoder Util Value ++ nvmlValue_t decUtil; //!< Decoder Util Value ++ nvmlValue_t jpgUtil; //!< Jpeg Util Value ++ nvmlValue_t ofaUtil; //!< Ofa Util Value ++} nvmlVgpuInstanceUtilizationInfo_v1_t; ++ ++/** ++ * Structure to store recent utilization for vGPU instances running on a device -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ nvmlValueType_t sampleValType; //!< Hold the type of returned sample values ++ unsigned int vgpuInstanceCount; //!< Hold the number of vGPU instances ++ unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp ++ nvmlVgpuInstanceUtilizationInfo_v1_t *vgpuUtilArray; //!< The array (allocated by caller) in which vGPU utilization are returned ++} nvmlVgpuInstancesUtilizationInfo_v1_t; ++typedef nvmlVgpuInstancesUtilizationInfo_v1_t nvmlVgpuInstancesUtilizationInfo_t; ++#define nvmlVgpuInstancesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuInstancesUtilizationInfo, 1) ++ ++/** ++ * Structure to store Utilization Value, vgpuInstance and subprocess information ++ */ ++typedef struct nvmlVgpuProcessUtilizationSample_st ++{ ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ unsigned int pid; //!< PID of process running within the vGPU VM ++ char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++} nvmlVgpuProcessUtilizationSample_t; ++ ++/** ++ * Structure to store Utilization Value, vgpuInstance and 
subprocess information for process running on vGPU instance -- version 1 ++ */ ++typedef struct ++{ ++ char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM ++ unsigned long long timeStamp; //!< CPU Timestamp in microseconds ++ nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance ++ unsigned int pid; //!< PID of process running within the vGPU VM ++ unsigned int smUtil; //!< SM (3D/Compute) Util Value ++ unsigned int memUtil; //!< Frame Buffer Memory Util Value ++ unsigned int encUtil; //!< Encoder Util Value ++ unsigned int decUtil; //!< Decoder Util Value ++ unsigned int jpgUtil; //!< Jpeg Util Value ++ unsigned int ofaUtil; //!< Ofa Util Value ++} nvmlVgpuProcessUtilizationInfo_v1_t; ++ ++/** ++ * Structure to store recent utilization, vgpuInstance and subprocess information for processes running on vGPU instances active on a device -- version 1 ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ unsigned int vgpuProcessCount; //!< Hold the number of processes running on vGPU instances ++ unsigned long long lastSeenTimeStamp; //!< Return only samples with timestamp greater than lastSeenTimeStamp ++ nvmlVgpuProcessUtilizationInfo_v1_t *vgpuProcUtilArray; //!< The array (allocated by caller) in which utilization of processes running on vGPU instances is returned ++} nvmlVgpuProcessesUtilizationInfo_v1_t; ++typedef nvmlVgpuProcessesUtilizationInfo_v1_t nvmlVgpuProcessesUtilizationInfo_t; ++#define nvmlVgpuProcessesUtilizationInfo_v1 NVML_STRUCT_VERSION(VgpuProcessesUtilizationInfo, 1) ++ ++/** ++ * vGPU scheduler policies ++ */ ++#define NVML_VGPU_SCHEDULER_POLICY_UNKNOWN 0 ++#define NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT 1 ++#define NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE 2 ++#define NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE 3 ++ ++#define NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT 3 //!< Size of nvmlVgpuSchedulerCapabilities_t::supportedSchedulers ++ ++#define NVML_SCHEDULER_SW_MAX_LOG_ENTRIES 200 //!< Size of nvmlVgpuSchedulerLog_t::logEntries ++ ++#define NVML_VGPU_SCHEDULER_ARR_DEFAULT 0 
++#define NVML_VGPU_SCHEDULER_ARR_DISABLE 1 ++#define NVML_VGPU_SCHEDULER_ARR_ENABLE 2 ++ ++/** ++ * Union to represent the vGPU Scheduler Parameters ++ */ ++typedef union ++{ ++ struct ++ { ++ unsigned int avgFactor; //!< Average factor in compensating the timeslice for Adaptive Round Robin mode ++ unsigned int timeslice; //!< The timeslice in ns for each software run list as configured, or the default value otherwise ++ } vgpuSchedDataWithARR; ++ ++ struct ++ { ++ unsigned int timeslice; //!< The timeslice in ns for each software run list as configured, or the default value otherwise ++ } vgpuSchedData; ++ ++} nvmlVgpuSchedulerParams_t; ++ ++/** ++ * Structure to store the state and logs of a software runlist ++ */ ++typedef struct nvmlVgpuSchedulerLogEntries_st ++{ ++ unsigned long long timestamp; //!< Timestamp in ns when this software runlist was preempted ++ unsigned long long timeRunTotal; //!< Total time in ns this software runlist has run ++ unsigned long long timeRun; //!< Time in ns this software runlist ran before preemption ++ unsigned int swRunlistId; //!< Software runlist Id ++ unsigned long long targetTimeSlice; //!< The actual timeslice after deduction ++ unsigned long long cumulativePreemptionTime; //!< Preemption time in ns for this SW runlist ++} nvmlVgpuSchedulerLogEntry_t; ++ ++/** ++ * Structure to store a vGPU software scheduler log ++ */ ++typedef struct nvmlVgpuSchedulerLog_st ++{ ++ unsigned int engineId; //!< Engine whose software runlist log entries are fetched ++ unsigned int schedulerPolicy; //!< Scheduler policy ++ unsigned int arrMode; //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*. 
++ nvmlVgpuSchedulerParams_t schedulerParams; ++ unsigned int entriesCount; //!< Count of log entries fetched ++ nvmlVgpuSchedulerLogEntry_t logEntries[NVML_SCHEDULER_SW_MAX_LOG_ENTRIES]; //!< Log entries; entriesCount gives the number fetched ++} nvmlVgpuSchedulerLog_t; ++ ++/** ++ * Structure to store the vGPU scheduler state ++ */ ++typedef struct nvmlVgpuSchedulerGetState_st ++{ ++ unsigned int schedulerPolicy; //!< Scheduler policy ++ unsigned int arrMode; //!< Adaptive Round Robin scheduler mode. One of the NVML_VGPU_SCHEDULER_ARR_*. ++ nvmlVgpuSchedulerParams_t schedulerParams; ++} nvmlVgpuSchedulerGetState_t; ++ ++/** ++ * Union to represent the vGPU Scheduler set Parameters ++ */ ++typedef union ++{ ++ struct ++ { ++ unsigned int avgFactor; //!< Average factor in compensating the timeslice for Adaptive Round Robin mode ++ unsigned int frequency; //!< Frequency for Adaptive Round Robin mode ++ } vgpuSchedDataWithARR; ++ ++ struct ++ { ++ unsigned int timeslice; //!< The timeslice in ns (nanoseconds) for each software run list as configured, or the default value otherwise ++ } vgpuSchedData; ++ ++} nvmlVgpuSchedulerSetParams_t; ++ ++/** ++ * Structure to set the vGPU scheduler state ++ */ ++typedef struct nvmlVgpuSchedulerSetState_st ++{ ++ unsigned int schedulerPolicy; //!< Scheduler policy ++ unsigned int enableARRMode; //!< Adaptive Round Robin scheduler ++ nvmlVgpuSchedulerSetParams_t schedulerParams; ++} nvmlVgpuSchedulerSetState_t; ++ ++/** ++ * Structure to store the vGPU scheduler capabilities ++ */ ++typedef struct nvmlVgpuSchedulerCapabilities_st ++{ ++ unsigned int supportedSchedulers[NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT]; //!< List the supported vGPU schedulers on the device ++ unsigned int maxTimeslice; //!< Maximum timeslice value in ns ++ unsigned int minTimeslice; //!< Minimum timeslice value in ns ++ unsigned int isArrModeSupported; //!< Flag to check Adaptive Round Robin mode enabled/disabled. 
++ unsigned int maxFrequencyForARR; //!< Maximum frequency for Adaptive Round Robin mode ++ unsigned int minFrequencyForARR; //!< Minimum frequency for Adaptive Round Robin mode ++ unsigned int maxAvgFactorForARR; //!< Maximum averaging factor for Adaptive Round Robin mode ++ unsigned int minAvgFactorForARR; //!< Minimum averaging factor for Adaptive Round Robin mode ++} nvmlVgpuSchedulerCapabilities_t; ++ ++/** ++ * Structure to store the vGPU license expiry details ++ */ ++typedef struct nvmlVgpuLicenseExpiry_st ++{ ++ unsigned int year; //!< Year of license expiry ++ unsigned short month; //!< Month of license expiry ++ unsigned short day; //!< Day of license expiry ++ unsigned short hour; //!< Hour of license expiry ++ unsigned short min; //!< Minutes of license expiry ++ unsigned short sec; //!< Seconds of license expiry ++ unsigned char status; //!< License expiry status ++} nvmlVgpuLicenseExpiry_t; ++ ++/** ++ * vGPU license state ++ */ ++#define NVML_GRID_LICENSE_STATE_UNKNOWN 0 //!< Unknown state ++#define NVML_GRID_LICENSE_STATE_UNINITIALIZED 1 //!< Uninitialized state ++#define NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED 2 //!< Unlicensed unrestricted state ++#define NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED 3 //!< Unlicensed restricted state ++#define NVML_GRID_LICENSE_STATE_UNLICENSED 4 //!< Unlicensed state ++#define NVML_GRID_LICENSE_STATE_LICENSED 5 //!< Licensed state ++ ++typedef struct nvmlVgpuLicenseInfo_st ++{ ++ unsigned char isLicensed; //!< License status ++ nvmlVgpuLicenseExpiry_t licenseExpiry; //!< License expiry information ++ unsigned int currentState; //!< Current license state ++} nvmlVgpuLicenseInfo_t; ++ ++/** ++ * Structure to store license expiry date and time values ++ */ ++typedef struct nvmlGridLicenseExpiry_st ++{ ++ unsigned int year; //!< Year value of license expiry ++ unsigned short month; //!< Month value of license expiry ++ unsigned short day; //!< Day value of license expiry ++ unsigned short hour; //!< Hour 
value of license expiry ++ unsigned short min; //!< Minutes value of license expiry ++ unsigned short sec; //!< Seconds value of license expiry ++ unsigned char status; //!< License expiry status ++} nvmlGridLicenseExpiry_t; ++ ++/** ++ * Structure containing vGPU software licensable feature information ++ */ ++typedef struct nvmlGridLicensableFeature_st ++{ ++ nvmlGridLicenseFeatureCode_t featureCode; //!< Licensed feature code ++ unsigned int featureState; //!< Non-zero if feature is currently licensed, otherwise zero ++ char licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Deprecated. ++ char productName[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Product name of feature ++ unsigned int featureEnabled; //!< Non-zero if feature is enabled, otherwise zero ++ nvmlGridLicenseExpiry_t licenseExpiry; //!< License expiry structure containing date and time ++} nvmlGridLicensableFeature_t; ++ ++/** ++ * Structure to store vGPU software licensable features ++ */ ++typedef struct nvmlGridLicensableFeatures_st ++{ ++ int isGridLicenseSupported; //!< Non-zero if vGPU Software Licensing is supported on the system, otherwise zero ++ unsigned int licensableFeaturesCount; //!< Entries returned in \a gridLicensableFeatures array ++ nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of vGPU software licensable features. ++} nvmlGridLicensableFeatures_t; ++ ++/** @} */ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlFieldValueEnums Field Value Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Field Identifiers. ++ * ++ * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. ++ */ ++#define NVML_FI_DEV_ECC_CURRENT 1 //!< Current ECC mode. 1=Active. 0=Inactive ++#define NVML_FI_DEV_ECC_PENDING 2 //!< Pending ECC mode. 
1=Active. 0=Inactive ++/* ECC Count Totals */ ++#define NVML_FI_DEV_ECC_SBE_VOL_TOTAL 3 //!< Total single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_TOTAL 4 //!< Total double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_TOTAL 5 //!< Total single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_TOTAL 6 //!< Total double bit aggregate (persistent) ECC errors ++/* Individual ECC locations */ ++#define NVML_FI_DEV_ECC_SBE_VOL_L1 7 //!< L1 cache single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_L1 8 //!< L1 cache double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_L2 9 //!< L2 cache single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_L2 10 //!< L2 cache double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_DEV 11 //!< Device memory single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_DEV 12 //!< Device memory double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_REG 13 //!< Register file single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_REG 14 //!< Register file double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_VOL_TEX 15 //!< Texture memory single bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_TEX 16 //!< Texture memory double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_DBE_VOL_CBU 17 //!< CBU double bit volatile ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_L1 18 //!< L1 cache single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_L1 19 //!< L1 cache double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_L2 20 //!< L2 cache single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_L2 21 //!< L2 cache double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_DEV 22 //!< Device memory single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_DEV 23 //!< Device memory double bit aggregate 
(persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_REG 24 //!< Register File single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_REG 25 //!< Register File double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_SBE_AGG_TEX 26 //!< Texture memory single bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_TEX 27 //!< Texture memory double bit aggregate (persistent) ECC errors ++#define NVML_FI_DEV_ECC_DBE_AGG_CBU 28 //!< CBU double bit aggregate ECC errors ++ ++/* Page Retirement */ ++#define NVML_FI_DEV_RETIRED_SBE 29 //!< Number of retired pages because of single bit errors ++#define NVML_FI_DEV_RETIRED_DBE 30 //!< Number of retired pages because of double bit errors ++#define NVML_FI_DEV_RETIRED_PENDING 31 //!< If any pages are pending retirement. 1=yes. 0=no. ++ ++/* NvLink Flit Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 32 //!< NVLink flow control CRC Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 33 //!< NVLink flow control CRC Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 34 //!< NVLink flow control CRC Error Counter for Lane 2 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 35 //!< NVLink flow control CRC Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 36 //!< NVLink flow control CRC Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 37 //!< NVLink flow control CRC Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 //!< NVLink flow control CRC Error Counter total for all Lanes ++ ++/* NvLink CRC Data Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 39 //!< NVLink data CRC Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 40 //!< NVLink data CRC Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 41 //!< NVLink data CRC Error Counter 
for Lane 2 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 42 //!< NVLink data CRC Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 43 //!< NVLink data CRC Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 44 //!< NVLink data CRC Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 45 //!< NVLink data CRC Error Counter total for all Lanes ++ ++/* NvLink Replay Error Counters */ ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 46 //!< NVLink Replay Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 47 //!< NVLink Replay Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 48 //!< NVLink Replay Error Counter for Lane 2 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 49 //!< NVLink Replay Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 50 //!< NVLink Replay Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 51 //!< NVLink Replay Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 52 //!< NVLink Replay Error Counter total for all Lanes ++ ++/* NvLink Recovery Error Counters */ ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 53 //!< NVLink Recovery Error Counter for Lane 0 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 54 //!< NVLink Recovery Error Counter for Lane 1 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 55 //!< NVLink Recovery Error Counter for Lane 2 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 56 //!< NVLink Recovery Error Counter for Lane 3 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 57 //!< NVLink Recovery Error Counter for Lane 4 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 58 //!< NVLink Recovery Error Counter for Lane 5 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 59 //!< NVLink Recovery Error Counter total for all Lanes ++ ++/* NvLink Bandwidth Counters */ ++/* ++ * 
NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. ++ * Please use the following field values instead: ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX ++ */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 60 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 0 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 61 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 1 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 62 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 2 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 63 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 3 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 64 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 4 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 65 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 5 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL 66 //!< NVLink Bandwidth Counter Total for Counter Set 0, All Lanes ++ ++/* NvLink Bandwidth Counters */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 67 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 0 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 68 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 1 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 69 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 2 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 70 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 3 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 71 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 4 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 72 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 5 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL 73 //!< NVLink Bandwidth Counter Total for Counter Set 1, All Lanes ++ ++/* NVML Perf Policy Counters */ ++#define NVML_FI_DEV_PERF_POLICY_POWER 74 //!< Perf Policy Counter for Power Policy ++#define NVML_FI_DEV_PERF_POLICY_THERMAL 75 //!< Perf Policy 
Counter for Thermal Policy ++#define NVML_FI_DEV_PERF_POLICY_SYNC_BOOST 76 //!< Perf Policy Counter for Sync boost Policy ++#define NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT 77 //!< Perf Policy Counter for Board Limit ++#define NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION 78 //!< Perf Policy Counter for Low GPU Utilization Policy ++#define NVML_FI_DEV_PERF_POLICY_RELIABILITY 79 //!< Perf Policy Counter for Reliability Policy ++#define NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS 80 //!< Perf Policy Counter for Total App Clock Policy ++#define NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS 81 //!< Perf Policy Counter for Total Base Clocks Policy ++ ++/* Memory temperatures */ ++#define NVML_FI_DEV_MEMORY_TEMP 82 //!< Memory temperature for the device ++ ++/* Energy Counter */ ++#define NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION 83 //!< Total energy consumption for the GPU in mJ since the driver was last reloaded ++ ++/* NVLink Speed */ ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 84 //!< NVLink Speed in MBps for Link 0 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 85 //!< NVLink Speed in MBps for Link 1 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 86 //!< NVLink Speed in MBps for Link 2 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 87 //!< NVLink Speed in MBps for Link 3 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 88 //!< NVLink Speed in MBps for Link 4 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 89 //!< NVLink Speed in MBps for Link 5 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links ++ ++#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device ++ ++#define NVML_FI_DEV_RETIRED_PENDING_SBE 92 //!< If any pages are pending retirement due to SBE. 1=yes. 0=no. ++#define NVML_FI_DEV_RETIRED_PENDING_DBE 93 //!< If any pages are pending retirement due to DBE. 1=yes. 0=no. 
++ ++#define NVML_FI_DEV_PCIE_REPLAY_COUNTER 94 //!< PCIe replay counter ++#define NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER 95 //!< PCIe replay rollover counter ++ ++/* NvLink Flit Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 96 //!< NVLink flow control CRC Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 97 //!< NVLink flow control CRC Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 98 //!< NVLink flow control CRC Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 99 //!< NVLink flow control CRC Error Counter for Lane 9 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 100 //!< NVLink flow control CRC Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 101 //!< NVLink flow control CRC Error Counter for Lane 11 ++ ++/* NvLink CRC Data Error Counters */ ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 102 //!< NVLink data CRC Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 103 //!< NVLink data CRC Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 104 //!< NVLink data CRC Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 105 //!< NVLink data CRC Error Counter for Lane 9 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 106 //!< NVLink data CRC Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 107 //!< NVLink data CRC Error Counter for Lane 11 ++ ++/* NvLink Replay Error Counters */ ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 108 //!< NVLink Replay Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 109 //!< NVLink Replay Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 110 //!< NVLink Replay Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 111 //!< NVLink Replay Error Counter for Lane 9 ++#define 
NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 112 //!< NVLink Replay Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 113 //!< NVLink Replay Error Counter for Lane 11 ++ ++/* NvLink Recovery Error Counters */ ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 114 //!< NVLink Recovery Error Counter for Lane 6 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 115 //!< NVLink Recovery Error Counter for Lane 7 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 116 //!< NVLink Recovery Error Counter for Lane 8 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 117 //!< NVLink Recovery Error Counter for Lane 9 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 118 //!< NVLink Recovery Error Counter for Lane 10 ++#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 119 //!< NVLink Recovery Error Counter for Lane 11 ++ ++/* NvLink Bandwidth Counters */ ++/* ++ * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. ++ * Please use the following field values instead: ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX ++ * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX ++ */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 120 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 6 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 121 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 7 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 122 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 8 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 123 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 9 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 124 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 10 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 125 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 11 ++ ++/* NvLink Bandwidth Counters */ ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 126 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 6 ++#define 
NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 127 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 7 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 128 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 8 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 129 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 9 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 130 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 10 ++#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 131 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 11 ++ ++/* NVLink Speed */ ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 132 //!< NVLink Speed in MBps for Link 6 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 133 //!< NVLink Speed in MBps for Link 7 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 134 //!< NVLink Speed in MBps for Link 8 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 135 //!< NVLink Speed in MBps for Link 9 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 136 //!< NVLink Speed in MBps for Link 10 ++#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 137 //!< NVLink Speed in MBps for Link 11 ++ ++/** ++ * NVLink throughput counters field values ++ * ++ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. ++ * A scopeId of UINT_MAX returns aggregate value summed up across all links ++ * for the specified counter type in fieldId. ++ */ ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX 138 //!< NVLink TX Data throughput in KiB ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX 139 //!< NVLink RX Data throughput in KiB ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX 140 //!< NVLink TX Data + protocol overhead in KiB ++#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX 141 //!< NVLink RX Data + protocol overhead in KiB ++ ++/* Row Remapper */ ++#define NVML_FI_DEV_REMAPPED_COR 142 //!< Number of remapped rows due to correctable errors ++#define NVML_FI_DEV_REMAPPED_UNC 143 //!< Number of remapped rows due to uncorrectable errors ++#define NVML_FI_DEV_REMAPPED_PENDING 144 //!< If any rows are pending remapping. 
1=yes 0=no ++#define NVML_FI_DEV_REMAPPED_FAILURE 145 //!< If any rows failed to be remapped 1=yes 0=no ++ ++/** ++ * Remote device NVLink ID ++ * ++ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. ++ */ ++#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID ++ ++/** ++ * NVSwitch: connected NVLink count ++ */ ++#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch ++ ++/* NvLink ECC Data Error Counters ++ * ++ * Lane ID needs to be specified in the scopeId field in nvmlFieldValue_t. ++ * ++ */ ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 148 //!< NVLink data ECC Error Counter for Link 0 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 149 //!< NVLink data ECC Error Counter for Link 1 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 150 //!< NVLink data ECC Error Counter for Link 2 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 151 //!< NVLink data ECC Error Counter for Link 3 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 152 //!< NVLink data ECC Error Counter for Link 4 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 153 //!< NVLink data ECC Error Counter for Link 5 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 154 //!< NVLink data ECC Error Counter for Link 6 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 155 //!< NVLink data ECC Error Counter for Link 7 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 156 //!< NVLink data ECC Error Counter for Link 8 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 157 //!< NVLink data ECC Error Counter for Link 9 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 158 //!< NVLink data ECC Error Counter for Link 10 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 159 //!< NVLink data ECC Error Counter for Link 11 ++#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NVLink data ECC Error Counter total for all Links ++ ++#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY 
161 //!< NVLink Replay Error Counter ++#define NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY 162 //!< NVLink Recovery Error Counter ++#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC 163 //!< NVLink CRC Error Counter ++#define NVML_FI_DEV_NVLINK_GET_SPEED 164 //!< NVLink Speed in MBps ++#define NVML_FI_DEV_NVLINK_GET_STATE 165 //!< NVLink State - Active,Inactive ++#define NVML_FI_DEV_NVLINK_GET_VERSION 166 //!< NVLink Version ++ ++#define NVML_FI_DEV_NVLINK_GET_POWER_STATE 167 //!< NVLink Power state. 0=HIGH_SPEED 1=LOW_SPEED ++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD 168 //!< NVLink length of idle period (units can be found from ++ // NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS) before ++ // transitioning links to sleep state ++ ++#define NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER 169 //!< Device PEX error recovery counter ++ ++#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device ++#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE ++#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links ++ ++#define NVML_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS 173 //!< PCIe Correctable Errors Counter ++#define NVML_FI_DEV_PCIE_COUNT_NAKS_RECEIVED 174 //!< PCIe NAK Receive Counter ++#define NVML_FI_DEV_PCIE_COUNT_RECEIVER_ERROR 175 //!< PCIe Receiver Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_BAD_TLP 176 //!< PCIe Bad TLP Counter ++#define NVML_FI_DEV_PCIE_COUNT_NAKS_SENT 177 //!< PCIe NAK Send Counter ++#define NVML_FI_DEV_PCIE_COUNT_BAD_DLLP 178 //!< PCIe Bad DLLP Counter ++#define NVML_FI_DEV_PCIE_COUNT_NON_FATAL_ERROR 179 //!< PCIe Non Fatal Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_FATAL_ERROR 180 //!< PCIe Fatal Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ 181 //!< PCIe Unsupported Request Counter ++#define NVML_FI_DEV_PCIE_COUNT_LCRC_ERROR 182 //!< PCIe LCRC Error Counter ++#define NVML_FI_DEV_PCIE_COUNT_LANE_ERROR 183 //!< PCIe Per Lane Error Counter. 
++ ++#define NVML_FI_DEV_IS_RESETLESS_MIG_SUPPORTED 184 //!< Device's Resetless MIG Capability ++ ++/** ++ * Retrieves power usage for this GPU in milliwatts. ++ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode and ++ * \ref nvmlDeviceGetPowerUsage. ++ * ++ * scopeId needs to be specified. It signifies: ++ * 0 - GPU Only Scope - Metrics for GPU are retrieved ++ * 1 - Module scope - Metrics for the module (e.g. CPU + GPU) are retrieved. ++ * Note: CPU here refers to NVIDIA CPU (e.g. Grace). x86 or non-NVIDIA ARM is not supported ++ */ ++#define NVML_FI_DEV_POWER_AVERAGE 185 //!< GPU power averaged over 1 sec interval, supported on Ampere (except GA100) or newer architectures. ++#define NVML_FI_DEV_POWER_INSTANT 186 //!< Current GPU power, supported on all architectures. ++#define NVML_FI_DEV_POWER_MIN_LIMIT 187 //!< Minimum power limit in milliwatts. ++#define NVML_FI_DEV_POWER_MAX_LIMIT 188 //!< Maximum power limit in milliwatts. ++#define NVML_FI_DEV_POWER_DEFAULT_LIMIT 189 //!< Default power limit in milliwatts (limit which device boots with). ++#define NVML_FI_DEV_POWER_CURRENT_LIMIT 190 //!< Limit currently enforced in milliwatts (This includes other limits set elsewhere. E.g. Out-of-band). ++#define NVML_FI_DEV_ENERGY 191 //!< Total energy consumption (in mJ) since the driver was last reloaded. Same as \ref NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION for the GPU. ++#define NVML_FI_DEV_POWER_REQUESTED_LIMIT 192 //!< Power limit requested by NVML or any other userspace client. ++ ++/** ++ * GPU T.Limit temperature thresholds in degree Celsius ++ * ++ * These fields are supported on Ada and later architectures and supersede \ref nvmlDeviceGetTemperatureThreshold.
++ */ ++#define NVML_FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT 193 //!< T.Limit temperature after which GPU may shut down for HW protection ++#define NVML_FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT 194 //!< T.Limit temperature after which GPU may begin HW slowdown ++#define NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT 195 //!< T.Limit temperature after which GPU may begin SW slowdown due to memory temperature ++#define NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT 196 //!< T.Limit temperature after which GPU may be throttled below base clock ++ ++#define NVML_FI_DEV_PCIE_COUNT_TX_BYTES 197 //!< PCIe transmit bytes. Value can be wrapped. ++#define NVML_FI_DEV_PCIE_COUNT_RX_BYTES 198 //!< PCIe receive bytes. Value can be wrapped. ++ ++#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MAX 199 //!< Max Nvlink Power Threshold. See NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD ++ ++#define NVML_FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE 200 //!< MIG mode independent, MIG query capable device. 1=yes. 0=no. ++ ++#define NVML_FI_DEV_NVLINK_COUNT_XMIT_PACKETS 201 //!usedGpuMemory is not supported ++ ++ ++ unsigned long long time; //!< Amount of time in ms during which the compute context was active. 
The time is reported as 0 if ++ //!< the process is not terminated ++ ++ unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process ++ ++ unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) ++ ++ unsigned int reserved[5]; //!< Reserved for future use ++} nvmlAccountingStats_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlEncoderStructs Encoder Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Represents type of encoder for capacity can be queried ++ */ ++typedef enum nvmlEncoderQueryType_enum ++{ ++ NVML_ENCODER_QUERY_H264 = 0x00, //!< H264 encoder ++ NVML_ENCODER_QUERY_HEVC = 0x01, //!< HEVC encoder ++ NVML_ENCODER_QUERY_AV1 = 0x02, //!< AV1 encoder ++ NVML_ENCODER_QUERY_UNKNOWN = 0xFF //!< Unknown encoder ++}nvmlEncoderType_t; ++ ++/** ++ * Structure to hold encoder session data ++ */ ++typedef struct nvmlEncoderSessionInfo_st ++{ ++ unsigned int sessionId; //!< Unique session ID ++ unsigned int pid; //!< Owning process ID ++ nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) ++ nvmlEncoderType_t codecType; //!< Video encoder type ++ unsigned int hResolution; //!< Current encode horizontal resolution ++ unsigned int vResolution; //!< Current encode vertical resolution ++ unsigned int averageFps; //!< Moving average encode frames per second ++ unsigned int averageLatency; //!< Moving average encode latency in microseconds ++}nvmlEncoderSessionInfo_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures ++* @{ ++*/ 
++/***************************************************************************************************/ ++ ++/** ++ * Represents frame buffer capture session type ++ */ ++typedef enum nvmlFBCSessionType_enum ++{ ++ NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknown ++ NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys ++ NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda ++ NVML_FBC_SESSION_TYPE_VID, //!< Vid ++ NVML_FBC_SESSION_TYPE_HWENC //!< HEnc ++} nvmlFBCSessionType_t; ++ ++/** ++ * Structure to hold frame buffer capture sessions stats ++ */ ++typedef struct nvmlFBCStats_st ++{ ++ unsigned int sessionsCount; //!< Total no of sessions ++ unsigned int averageFPS; //!< Moving average new frames captured per second ++ unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds ++} nvmlFBCStats_t; ++ ++#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED 0x00000001 //!< Bit specifying differential map state. ++#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED 0x00000002 //!< Bit specifying classification map state. ++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT 0x00000004 //!< Bit specifying if capture was requested as non-blocking call. ++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE 0x00000008 //!< Bit specifying if capture was requested as blocking call. ++#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT 0x00000010 //!< Bit specifying if capture was requested as blocking call with timeout period. ++ ++/** ++ * Structure to hold FBC session data ++ */ ++typedef struct nvmlFBCSessionInfo_st ++{ ++ unsigned int sessionId; //!< Unique session ID ++ unsigned int pid; //!< Owning process ID ++ nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) ++ unsigned int displayOrdinal; //!< Display identifier ++ nvmlFBCSessionType_t sessionType; //!< Type of frame buffer capture session ++ unsigned int sessionFlags; //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX). 
++ unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session ++ unsigned int vMaxResolution; //!< Max vertical resolution supported by the capture session ++ unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call ++ unsigned int vResolution; //!< Vertical resolution requested by caller in capture call ++ unsigned int averageFPS; //!< Moving average new frames captured per second ++ unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds ++} nvmlFBCSessionInfo_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDrainDefs Drain State definitions ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu() ++ */ ++typedef enum nvmlDetachGpuState_enum ++{ ++ NVML_DETACH_GPU_KEEP = 0, ++ NVML_DETACH_GPU_REMOVE ++} nvmlDetachGpuState_t; ++ ++/** ++ * Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu() ++ */ ++typedef enum nvmlPcieLinkState_enum ++{ ++ NVML_PCIE_LINK_KEEP = 0, ++ NVML_PCIE_LINK_SHUT_DOWN ++} nvmlPcieLinkState_t; ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlConfidentialComputingDefs Confidential Computing definitions ++ * @{ ++ */ ++/***************************************************************************************************/ ++/** ++ * Confidential Compute CPU Capabilities values ++ */ ++#define NVML_CC_SYSTEM_CPU_CAPS_NONE 0 ++#define NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV 1 ++#define NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX 2 ++ ++/** ++ * Confidential Compute GPU Capabilities values ++ */ ++#define NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE 0 ++#define NVML_CC_SYSTEM_GPUS_CC_CAPABLE 1 ++ ++typedef struct
nvmlConfComputeSystemCaps_st { ++ unsigned int cpuCaps; ++ unsigned int gpusCaps; ++} nvmlConfComputeSystemCaps_t; ++ ++/** ++ * Confidential Compute DevTools Mode values ++ */ ++#define NVML_CC_SYSTEM_DEVTOOLS_MODE_OFF 0 ++#define NVML_CC_SYSTEM_DEVTOOLS_MODE_ON 1 ++ ++/** ++ * Confidential Compute Environment values ++ */ ++#define NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE 0 ++#define NVML_CC_SYSTEM_ENVIRONMENT_SIM 1 ++#define NVML_CC_SYSTEM_ENVIRONMENT_PROD 2 ++ ++/** ++ * Confidential Compute Feature Status values ++ */ ++#define NVML_CC_SYSTEM_FEATURE_DISABLED 0 ++#define NVML_CC_SYSTEM_FEATURE_ENABLED 1 ++ ++typedef struct nvmlConfComputeSystemState_st { ++ unsigned int environment; ++ unsigned int ccFeature; ++ unsigned int devToolsMode; ++} nvmlConfComputeSystemState_t; ++ ++/** ++ * Confidential Compute Multigpu mode values ++ */ ++#define NVML_CC_SYSTEM_MULTIGPU_NONE 0 ++#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 ++ ++/** ++ * Confidential Compute System settings ++ */ ++typedef struct { ++ unsigned int version; ++ unsigned int environment; ++ unsigned int ccFeature; ++ unsigned int devToolsMode; ++ unsigned int multiGpuMode; ++} nvmlSystemConfComputeSettings_v1_t; ++ ++typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t; ++#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1) ++ ++/** ++ * Protected memory size ++ */ ++typedef struct ++nvmlConfComputeMemSizeInfo_st ++{ ++ unsigned long long protectedMemSizeKib; ++ unsigned long long unprotectedMemSizeKib; ++} nvmlConfComputeMemSizeInfo_t; ++ ++/** ++ * Confidential Compute GPUs/System Ready State values ++ */ ++#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE 0 ++#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE 1 ++ ++/** ++ * GPU Certificate Details ++ */ ++#define NVML_GPU_CERT_CHAIN_SIZE 0x1000 ++#define NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE 0x1400 ++ ++typedef struct nvmlConfComputeGpuCertificate_st { ++ unsigned int certChainSize; ++ 
unsigned int attestationCertChainSize; ++ unsigned char certChain[NVML_GPU_CERT_CHAIN_SIZE]; ++ unsigned char attestationCertChain[NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE]; ++} nvmlConfComputeGpuCertificate_t; ++ ++/** ++ * GPU Attestation Report ++ */ ++#define NVML_CC_GPU_CEC_NONCE_SIZE 0x20 ++#define NVML_CC_GPU_ATTESTATION_REPORT_SIZE 0x2000 ++#define NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE 0x1000 ++#define NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT 0 ++#define NVML_CC_CEC_ATTESTATION_REPORT_PRESENT 1 ++#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN 50 ++#define NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX 75 ++ ++typedef struct nvmlConfComputeGpuAttestationReport_st { ++ unsigned int isCecAttestationReportPresent; ++ unsigned int attestationReportSize; ++ unsigned int cecAttestationReportSize; ++ unsigned char nonce[NVML_CC_GPU_CEC_NONCE_SIZE]; ++ unsigned char attestationReport[NVML_CC_GPU_ATTESTATION_REPORT_SIZE]; ++ unsigned char cecAttestationReport[NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE]; ++} nvmlConfComputeGpuAttestationReport_t; ++ ++typedef struct nvmlConfComputeSetKeyRotationThresholdInfo_st { ++ unsigned int version; ++ unsigned long long maxAttackerAdvantage; ++} nvmlConfComputeSetKeyRotationThresholdInfo_v1_t; ++ ++typedef nvmlConfComputeSetKeyRotationThresholdInfo_v1_t nvmlConfComputeSetKeyRotationThresholdInfo_t; ++#define nvmlConfComputeSetKeyRotationThresholdInfo_v1 \ ++ NVML_STRUCT_VERSION(ConfComputeSetKeyRotationThresholdInfo, 1) ++ ++typedef struct nvmlConfComputeGetKeyRotationThresholdInfo_st { ++ unsigned int version; ++ unsigned long long attackerAdvantage; ++} nvmlConfComputeGetKeyRotationThresholdInfo_v1_t; ++ ++typedef nvmlConfComputeGetKeyRotationThresholdInfo_v1_t nvmlConfComputeGetKeyRotationThresholdInfo_t; ++#define nvmlConfComputeGetKeyRotationThresholdInfo_v1 \ ++ NVML_STRUCT_VERSION(ConfComputeGetKeyRotationThresholdInfo, 1) ++ ++/** @} */ ++ 
++/***************************************************************************************************/ ++/** @defgroup nvmlFabricDefs Fabric definitions ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++#define NVML_GPU_FABRIC_UUID_LEN 16 ++ ++#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0 ++#define NVML_GPU_FABRIC_STATE_NOT_STARTED 1 ++#define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2 ++#define NVML_GPU_FABRIC_STATE_COMPLETED 3 ++ ++typedef unsigned char nvmlGpuFabricState_t; ++ ++/** ++ * Contains the device fabric information ++ */ ++typedef struct { ++ unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs ++ nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". ++ unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs ++ nvmlGpuFabricState_t state; //!< Current state of GPU registration process ++} nvmlGpuFabricInfo_t; ++ ++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0 ++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE 1 ++#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE 2 ++ ++#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0 ++#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x11 ++ ++/** ++ * GPU Fabric Health Status Mask for various fields can be obtained ++ * using the below macro. ++ * Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW) ++ */ ++#define NVML_GPU_FABRIC_HEALTH_GET(var, type) \ ++ (((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \ ++ (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type)) ++ ++/** ++ * GPU Fabric Health Status Mask for various fields can be tested ++ * using the below macro. 
++ * Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE) ++ */ ++#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \ ++ (NVML_GPU_FABRIC_HEALTH_GET(var, type) == \ ++ NVML_GPU_FABRIC_HEALTH_MASK##type##val) ++ ++/** ++* GPU Fabric information (v2). ++* ++* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field ++* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask ++* field to the end. This structure is not backwards-compatible with ++* \ref nvmlGpuFabricInfo_t. ++*/ ++typedef struct { ++ unsigned int version; //!< Structure version identifier (set to \p nvmlGpuFabricInfo_v2) ++ unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs ++ nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". ++ unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs ++ nvmlGpuFabricState_t state; //!< Current state of GPU registration process ++ unsigned int healthMask; //!< GPU Fabric health Status Mask ++} nvmlGpuFabricInfo_v2_t; ++ ++typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; ++ ++/** ++* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version. ++*/ ++#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) ++ ++/** ++ * Device Scope - This is useful to retrieve the telemetry at GPU and module (e.g. 
GPU + CPU) level ++ */ ++#define NVML_POWER_SCOPE_GPU 0U //!< Targets only GPU ++#define NVML_POWER_SCOPE_MODULE 1U //!< Targets the whole module ++#define NVML_POWER_SCOPE_MEMORY 2U //!< Targets the GPU Memory ++ ++typedef unsigned char nvmlPowerScopeType_t; ++ ++/** ++ * Contains the power management limit ++ */ ++typedef struct ++{ ++ unsigned int version; //!< Structure format version (must be 1) ++ nvmlPowerScopeType_t powerScope; //!< [in] Device type: GPU or Total Module ++ unsigned int powerValueMw; //!< [out] Power value to retrieve or set in milliwatts ++} nvmlPowerValue_v2_t; ++ ++#define nvmlPowerValue_v2 NVML_STRUCT_VERSION(PowerValue, 2) ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup ++ * This chapter describes the methods that handle NVML initialization and cleanup. ++ * It is the user's responsibility to call \ref nvmlInit_v2() before calling any other methods, and ++ * nvmlShutdown() once NVML is no longer being used. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++#define NVML_INIT_FLAG_NO_GPUS 1 //!< Don't fail nvmlInit() when no GPUs are found ++#define NVML_INIT_FLAG_NO_ATTACH 2 //!< Don't attach GPUs ++ ++/** ++ * Initialize NVML, but don't initialize any GPUs yet. ++ * ++ * \note nvmlInit_v3 introduces a "flags" argument, that allows passing boolean values ++ * modifying the behaviour of nvmlInit(). ++ * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that ++ * did initialize all GPU devices in the system. ++ * ++ * This allows NVML to communicate with a GPU ++ * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are ++ * discovered and initialized in nvmlDeviceGetHandleBy* functions instead. 
++ * ++ * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in ++ * a bad or unstable state. ++ * ++ * For all products. ++ * ++ * This method, should be called once before invoking any other methods in the library. ++ * A reference count of the number of initializations is maintained. Shutdown only occurs ++ * when the reference count reaches zero. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if NVML has been properly initialized ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running ++ * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlInit_v2(void); ++ ++/** ++ * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values ++ * modifying the behaviour of nvmlInit(). ++ * Other than the "flags" parameter it is completely similar to \ref nvmlInit_v2. ++ * ++ * For all products. ++ * ++ * @param flags behaviour modifier flags ++ * ++ * @return ++ * - \ref NVML_SUCCESS if NVML has been properly initialized ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running ++ * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags); ++ ++/** ++ * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit_v2(). ++ * ++ * For all products. ++ * ++ * This method should be called after NVML work is done, once for each call to \ref nvmlInit_v2() ++ * A reference count of the number of initializations is maintained. Shutdown only occurs ++ * when the reference count reaches zero. For backwards compatibility, no error is reported if ++ * nvmlShutdown() is called more times than nvmlInit(). 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if NVML has been properly shut down ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlShutdown(void); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlErrorReporting Error reporting ++ * This chapter describes helper functions for error reporting routines. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Helper method for converting NVML error codes into readable strings. ++ * ++ * For all products. ++ * ++ * @param result NVML error code to convert ++ * ++ * @return String representation of the error. ++ * ++ */ ++const DECLDIR char* nvmlErrorString(nvmlReturn_t result); ++/** @} */ ++ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlConstants Constants ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion ++ */ ++#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE 16 ++ ++/** ++ * Buffer size guaranteed to be large enough for storing GPU identifiers. 
++ */ ++#define NVML_DEVICE_UUID_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID ++ */ ++#define NVML_DEVICE_UUID_V2_BUFFER_SIZE 96 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber ++ */ ++#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion ++ */ ++#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion ++ */ ++#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 ++ ++/** ++ * Buffer size guaranteed to be large enough for storing GPU device names. ++ */ ++#define NVML_DEVICE_NAME_BUFFER_SIZE 64 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName ++ */ ++#define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial ++ */ ++#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 ++ ++/** ++ * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion ++ */ ++#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlSystemQueries System Queries ++ * This chapter describes the queries that NVML can perform against the local system. These queries ++ * are not device-specific. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieves the version of the system's graphics driver. ++ * ++ * For all products. ++ * ++ * The version identifier is an alphanumeric string. It will not exceed 80 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. 
++ * ++ * @param version Reference in which to return the version identifier ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); ++ ++/** ++ * Retrieves the version of the NVML library. ++ * ++ * For all products. ++ * ++ * The version identifier is an alphanumeric string. It will not exceed 80 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. ++ * ++ * @param version Reference in which to return the version identifier ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); ++ ++/** ++ * Retrieves the version of the CUDA driver. ++ * ++ * For all products. ++ * ++ * The CUDA driver version returned will be retrieved from the currently installed version of CUDA. ++ * If the cuda library is not found, this function will return a known supported version number. ++ * ++ * @param cudaDriverVersion Reference in which to return the version identifier ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion); ++ ++/** ++ * Retrieves the version of the CUDA driver from the shared library.
++ * ++ * For all products. ++ * ++ * The CUDA driver version is obtained by calling cuDriverGetVersion() ++ * ++ * @param cudaDriverVersion Reference in which to return the version identifier ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL ++ * - \ref NVML_ERROR_LIBRARY_NOT_FOUND if \a libcuda.so.1 or libcuda.dll is not found ++ * - \ref NVML_ERROR_FUNCTION_NOT_FOUND if \a cuDriverGetVersion() is not found in the shared library ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); ++ ++/** ++ * Macros for converting the CUDA driver version number to Major and Minor version numbers. ++ */ ++#define NVML_CUDA_DRIVER_VERSION_MAJOR(v) ((v)/1000) ++#define NVML_CUDA_DRIVER_VERSION_MINOR(v) (((v)%1000)/10) ++ ++/** ++ * Gets name of the process with provided process id ++ * ++ * For all products. ++ * ++ * Returned process name is cropped to provided length. ++ * name string is encoded in ANSI. ++ * ++ * @param pid The identifier of the process ++ * @param name Reference in which to return the process name ++ * @param length The maximum allowed length of the string returned in \a name ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a name has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a name is NULL or \a length is 0. ++ * - \ref NVML_ERROR_NOT_FOUND if the process doesn't exist ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); ++ ++/** ++ * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. ++ * ++ * For S-class products.
++ * ++ * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. ++ * The HIC must be connected to an S-class system for it to be reported by this function. ++ * ++ * @param hwbcCount Size of hwbcEntries array ++ * @param hwbcEntries Array holding information about hwbc ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); ++ ++/** ++ * Retrieve the set of GPUs that have a CPU affinity with the given CPU number ++ * For all products. ++ * Supported on Linux only. ++ * ++ * @param cpuNumber The CPU number ++ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray ++ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count ++ * number of device handles. 
++ * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); ++ ++/** ++ * Structure to store Driver branch information ++ */ ++typedef struct ++{ ++ unsigned int version; //!< The version number of this struct ++ char branch[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< driver branch ++} nvmlSystemDriverBranchInfo_v1_t; ++typedef nvmlSystemDriverBranchInfo_v1_t nvmlSystemDriverBranchInfo_t; ++#define nvmlSystemDriverBranchInfo_v1 NVML_STRUCT_VERSION(SystemDriverBranchInfo, 1) ++ ++/** ++ * Retrieves the driver branch of the NVIDIA driver installed on the system. ++ * ++ * For all products. ++ * ++ * The branch identifier is an alphanumeric string. It will not exceed 80 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. 
++ * ++ * @param branchInfo Pointer to the driver branch information structure \a nvmlSystemDriverBranchInfo_t ++ * @param length The maximum allowed length of the driver branch string ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a branchInfo is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetDriverBranch(nvmlSystemDriverBranchInfo_t *branchInfo, unsigned int length); ++ ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlUnitQueries Unit Queries ++ * This chapter describes the queries that NVML can perform against each unit. For S-class systems only. ++ * In each case the unit is identified with an nvmlUnit_t handle. This handle is obtained by ++ * calling \ref nvmlUnitGetHandleByIndex(). ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++ /** ++ * Retrieves the number of units in the system. ++ * ++ * For S-class products. ++ * ++ * @param unitCount Reference in which to return the number of units ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a unitCount has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); ++ ++/** ++ * Acquire the handle for a particular unit, based on its index. ++ * ++ * For S-class products. ++ * ++ * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount().
++ * For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1. ++ * ++ * The order in which NVML enumerates units has no guarantees of consistency between reboots. ++ * ++ * @param index The index of the target unit, >= 0 and < \a unitCount ++ * @param unit Reference in which to return the unit handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a unit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); ++ ++/** ++ * Retrieves the static information associated with a unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlUnitInfo_t for details on available unit info. ++ * ++ * @param unit The identifier of the target unit ++ * @param info Reference in which to return the unit information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a info has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); ++ ++/** ++ * Retrieves the LED state associated with this unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlLedState_t for details on allowed states. 
++ * ++ * @param unit The identifier of the target unit ++ * @param state Reference in which to return the current LED state ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a state has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlUnitSetLedState() ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); ++ ++/** ++ * Retrieves the PSU stats for the unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlPSUInfo_t for details on available PSU info. ++ * ++ * @param unit The identifier of the target unit ++ * @param psu Reference in which to return the PSU information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a psu has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); ++ ++/** ++ * Retrieves the temperature readings for the unit, in degrees C. ++ * ++ * For S-class products. ++ * ++ * Depending on the product, readings may be available for intake (type=0), ++ * exhaust (type=1) and board (type=2). 
++ * ++ * @param unit The identifier of the target unit ++ * @param type The type of reading to take ++ * @param temp Reference in which to return the intake temperature ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); ++ ++/** ++ * Retrieves the fan speed readings for the unit. ++ * ++ * For S-class products. ++ * ++ * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. ++ * ++ * @param unit The identifier of the target unit ++ * @param fanSpeeds Reference in which to return the fan speed information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a fanSpeeds has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); ++ ++/** ++ * Retrieves the set of GPU devices that are attached to the specified unit. ++ * ++ * For S-class products. ++ * ++ * The \a deviceCount argument is expected to be set to the size of the input \a devices array. 
++ * ++ * @param unit The identifier of the target unit ++ * @param deviceCount Reference in which to provide the \a devices array size, and ++ * to return the number of attached GPU devices ++ * @param devices Reference in which to return the references to the attached GPU devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceQueries Device Queries ++ * This chapter describes the queries that NVML can perform against each device. ++ * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by ++ * calling one of \ref nvmlDeviceGetHandleByIndex_v2(), \ref nvmlDeviceGetHandleBySerial(), ++ * \ref nvmlDeviceGetHandleByPciBusId_v2(), or \ref nvmlDeviceGetHandleByUUID(). ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++ /** ++ * Retrieves the number of compute devices in the system. A compute device is a single GPU. ++ * ++ * For all products. ++ * ++ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system ++ * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. ++ * Update your code to handle this error, or use NVML 4.304 or older nvml header file.
++ * For backward binary compatibility reasons _v1 version of the API is still present in the shared ++ * library. ++ * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. ++ * ++ * @param deviceCount Reference in which to return the number of accessible devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceCount has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCount_v2(unsigned int *deviceCount); ++ ++/** ++ * Get attributes (engine counts etc.) for the given NVML device handle. ++ * ++ * @note This API currently only supports MIG device handles. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device NVML device handle ++ * @param attributes Device attributes ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device attributes were successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is invalid ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); ++ ++/** ++ * Acquire the handle for a particular device, based on its index. ++ * ++ * For all products. ++ * ++ * Valid indices are derived from the \a accessibleDevices count returned by ++ * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices ++ * are 0 and 1, corresponding to GPU 0 and GPU 1. ++ * ++ * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it ++ * is recommended that devices be looked up by their PCI ids or UUID. See ++ * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId_v2(). ++ * ++ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. ++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs if: ++ * - The target GPU is an SLI slave ++ * ++ * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system ++ * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. ++ * Update your code to handle this error, or use NVML 4.304 or older nvml header file. ++ * For backward binary compatibility reasons _v1 version of the API is still present in the shared ++ * library. ++ * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. ++ * ++ * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. ++ * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't ++ * need to worry about that. 
++ * ++ * @param index The index of the target GPU, >= 0 and < \a accessibleDevices ++ * @param device Reference in which to return the device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetIndex ++ * @see nvmlDeviceGetCount ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t *device); ++ ++/** ++ * Acquire the handle for a particular device, based on its board serial number. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * This number corresponds to the value printed directly on the board, and to the value returned by ++ * \ref nvmlDeviceGetSerial(). ++ * ++ * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor ++ * of \ref nvmlDeviceGetHandleByUUID. ++ * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs as it searches for the target GPU ++ * ++ * @param serial The board serial number of the target GPU ++ * @param device Reference in which to return the device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one ++ * device has the same serial (dual GPU boards) ++ * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetSerial ++ * @see nvmlDeviceGetHandleByUUID ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); ++ ++/** ++ * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. ++ * ++ * For all products. 
++ * ++ * @param uuid The UUID of the target GPU or MIG instance ++ * @param device Reference in which to return the device handle or MIG device handle ++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs as it searches for the target GPU ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null ++ * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetUUID ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); ++ ++/** ++ * Acquire the handle for a particular device, based on its PCI bus id. ++ * ++ * For all products. ++ * ++ * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo_v3(). ++ * ++ * Starting from NVML 5, this API causes NVML to initialize the target GPU ++ * NVML may initialize additional GPUs if: ++ * - The target GPU is an SLI slave ++ * ++ * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND ++ * instead of NVML_ERROR_NO_PERMISSION. 
++ * ++ * @param pciBusId The PCI bus id of the target GPU ++ * @param device Reference in which to return the device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device ++ * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, nvmlDevice_t *device); ++ ++/** ++ * Retrieves the name of this device. ++ * ++ * For all products. ++ * ++ * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not ++ * exceed 96 characters in length (including the NULL terminator). See \ref ++ * nvmlConstants::NVML_DEVICE_NAME_V2_BUFFER_SIZE. ++ * ++ * When used with MIG device handles the API returns MIG device names which can be used to identify devices ++ * based on their attributes. 
++ * ++ * @param device The identifier of the target device ++ * @param name Reference in which to return the product name ++ * @param length The maximum allowed length of the string returned in \a name ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a name has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); ++ ++/** ++ * Retrieves the brand of this device. ++ * ++ * For all products. ++ * ++ * The type is a member of \ref nvmlBrandType_t defined above. ++ * ++ * @param device The identifier of the target device ++ * @param type Reference in which to return the product brand type ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a type has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); ++ ++/** ++ * Retrieves the NVML index of this device. ++ * ++ * For all products. ++ * ++ * Valid indices are derived from the \a accessibleDevices count returned by ++ * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices ++ * are 0 and 1, corresponding to GPU 0 and GPU 1. ++ * ++ * The order in which NVML enumerates devices has no guarantees of consistency between reboots.
For that reason it ++ * is recommended that devices be looked up by their PCI ids or GPU UUID. See ++ * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). ++ * ++ * When used with MIG device handles this API returns indices that can be ++ * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. ++ * MIG device indices are unique within a device. ++ * ++ * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. ++ * ++ * @param device The identifier of the target device ++ * @param index Reference in which to return the NVML index of the device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a index has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetHandleByIndex() ++ * @see nvmlDeviceGetCount() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); ++ ++/** ++ * Retrieves the globally unique board serial number associated with this device's board. ++ * ++ * For all products with an inforom. ++ * ++ * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). ++ * This number matches the serial number tag that is physically attached to the board. See \ref ++ * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
++ * ++ * @param device The identifier of the target device ++ * @param serial Reference in which to return the board/module serial number ++ * @param length The maximum allowed length of the string returned in \a serial ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a serial has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); ++ ++/** ++ * Get a unique identifier for the device module on the baseboard ++ * ++ * This API retrieves a unique identifier for each GPU module that exists on a given baseboard. ++ * For non-baseboard products, this ID would always be 0. 
++ * ++ * @param device The identifier of the target device ++ * @param moduleId Unique identifier for the GPU module ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a moduleId has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a moduleId is invalid ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetModuleId(nvmlDevice_t device, unsigned int *moduleId); ++ ++/** ++ * Retrieves the Device's C2C Mode information ++ * ++ * @param device The identifier of the target device ++ * @param c2cModeInfo Output struct containing the device's C2C Mode info ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the C2C mode info query is successful ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a c2cModeInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetC2cModeInfoV(nvmlDevice_t device, nvmlC2cModeInfo_v1_t *c2cModeInfo); ++ ++/***************************************************************************************************/ ++ ++/** @defgroup nvmlAffinity CPU and Memory Affinity ++ * This chapter describes NVML operations that are associated with CPU and memory ++ * affinity. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++//! Scope of NUMA node for affinity queries ++#define NVML_AFFINITY_SCOPE_NODE 0 ++//! 
Scope of processor socket for affinity queries ++#define NVML_AFFINITY_SCOPE_SOCKET 1 ++ ++typedef unsigned int nvmlAffinityScope_t; ++ ++/** ++ * Retrieves an array of unsigned ints (sized to nodeSetSize) of bitmasks with ++ * the ideal memory affinity within node or socket for the device. ++ * For example, if NUMA node 0, 1 are ideal within the socket for the device and nodeSetSize == 1, ++ * result[0] = 0x3 ++ * ++ * \note If requested scope is not applicable to the target topology, the API ++ * will fall back to reporting the memory affinity for the immediate non-I/O ++ * ancestor of the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param nodeSetSize The size of the nodeSet array that is safe to access ++ * @param nodeSet Array reference in which to return a bitmask of NODEs, 64 NODEs per ++ * unsigned long on 64-bit machines, 32 on 32-bit machines ++ * @param scope Scope that change the default behavior ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a NUMA node Affinity has been filled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, nodeSetSize == 0, nodeSet is NULL or scope is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long *nodeSet, nvmlAffinityScope_t scope); ++ ++/** ++ * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ++ * ideal CPU affinity within node or socket for the device. 
++ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, ++ * result[0] = 0x3, result[1] = 0x3 ++ * ++ * \note If requested scope is not applicable to the target topology, the API ++ * will fall back to reporting the CPU affinity for the immediate non-I/O ++ * ancestor of the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param cpuSetSize The size of the cpuSet array that is safe to access ++ * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per ++ * unsigned long on 64-bit machines, 32 on 32-bit machines ++ * @param scope Scope that change the default behavior ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cpuAffinity has been filled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, cpuSet is NULL or scope is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++ ++nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet, nvmlAffinityScope_t scope); ++ ++/** ++ * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device ++ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, ++ * result[0] = 0x3, result[1] = 0x3 ++ * This is equivalent to calling \ref nvmlDeviceGetCpuAffinityWithinScope with \ref NVML_AFFINITY_SCOPE_NODE. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param device The identifier of the target device ++ * @param cpuSetSize The size of the cpuSet array that is safe to access ++ * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per ++ * unsigned long on 64-bit machines, 32 on 32-bit machines ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a cpuAffinity has been filled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); ++ ++/** ++ * Sets the ideal affinity for the calling thread and device using the guidelines ++ * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. ++ * Older versions set the affinity for a calling process and all children. ++ * Currently supports up to 1024 processors. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the calling process has been successfully bound ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); ++ ++/** ++ * Clear all affinity bindings for the calling thread. 
Note, this is a change as of version ++ * 8.0 as older versions cleared the affinity for a calling process and all children. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the calling process has been successfully unbound ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); ++ ++/** ++ * Get the NUMA node of the given GPU device. ++ * This only applies to platforms where the GPUs are NUMA nodes. ++ * ++ * @param[in] device The device handle ++ * @param[out] node NUMA node ID of the device ++ * ++ * @returns ++ * - \ref NVML_SUCCESS if the NUMA node is retrieved successfully ++ * - \ref NVML_ERROR_NOT_SUPPORTED if request is not supported on the current platform ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a node is invalid ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNumaNodeId(nvmlDevice_t device, unsigned int *node); ++/** ++ * Retrieve the common ancestor for two devices ++ * For all products. ++ * Supported on Linux only. 
++ * ++ * @param device1 The identifier of the first device ++ * @param device2 The identifier of the second device ++ * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pathInfo has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery ++ */ ++ ++/** @} */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); ++ ++/** ++ * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level ++ * For all products. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the first device ++ * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs ++ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray ++ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count ++ * number of device handles. 
++ * @param deviceArray An array of device handles for GPUs found at \a level ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); ++ ++/** ++ * Retrieve the status for a given p2p capability index between a given pair of GPU ++ * ++ * @param device1 The first device ++ * @param device2 The second device ++ * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 ++ * @param p2pStatus Reference in which to return the status of the \a p2pIndex ++ * between \a device1 and \a device2 ++ * @return ++ * - \ref NVML_SUCCESS if \a p2pStatus has been populated ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); ++ ++/** ++ * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, ++ * that augments the immutable, board serial identifier. ++ * ++ * For all products. ++ * ++ * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. ++ * It does NOT correspond to any identifier printed on the board. It will not exceed 96 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_V2_BUFFER_SIZE. 
++ * ++ * When used with MIG device handles the API returns globally unique UUIDs which can be used to identify MIG ++ * devices across both GPU and MIG devices. UUIDs are immutable for the lifetime of a MIG device. ++ * ++ * @param device The identifier of the target device ++ * @param uuid Reference in which to return the GPU UUID ++ * @param length The maximum allowed length of the string returned in \a uuid ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a uuid has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); ++ ++/** ++ * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for ++ * each GPU will have the form /dev/nvidia[minor number]. ++ * ++ * For all products. 
++ * Supported only for Linux ++ * ++ * @param device The identifier of the target device ++ * @param minorNumber Reference in which to return the minor number for the device ++ * @return ++ * - \ref NVML_SUCCESS if the minor number is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); ++ ++/** ++ * Retrieves the device board part number which is programmed into the board's InfoROM ++ * ++ * For all products. ++ * ++ * @param device Identifier of the target device ++ * @param partNumber Reference to the buffer to return ++ * @param length Length of the buffer reference ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a partNumber has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); ++ ++/** ++ * Retrieves the version information for the device's infoROM object. ++ * ++ * For all products with an inforom. ++ * ++ * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate ++ * ECC counts.
The version of the data structures in this memory may change from time to time. It will not ++ * exceed 16 characters in length (including the NULL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. ++ * ++ * See \ref nvmlInforomObject_t for details on the available infoROM objects. ++ * ++ * @param device The identifier of the target device ++ * @param object The target infoROM object ++ * @param version Reference in which to return the infoROM version ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetInforomImageVersion ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); ++ ++/** ++ * Retrieves the global infoROM image version ++ * ++ * For all products with an inforom. ++ * ++ * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board ++ * in contrast to infoROM object version which is only an indicator of supported features. ++ * Version string will not exceed 16 characters in length (including the NULL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
++ * ++ * @param device The identifier of the target device ++ * @param version Reference in which to return the infoROM image version ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetInforomVersion ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); ++ ++/** ++ * Retrieves the checksum of the configuration stored in the device's infoROM. ++ * ++ * For all products with an inforom. ++ * ++ * Can be used to make sure that two GPUs have the exact same configuration. ++ * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. ++ * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) ++ * ++ * @param device The identifier of the target device ++ * @param checksum Reference in which to return the infoROM configuration checksum ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a checksum has been set ++ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); ++ ++/** ++ * Reads the infoROM from the flash and verifies the checksums. ++ * ++ * For all products with an inforom. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if infoROM is not corrupted ++ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); ++ ++/** ++ * Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run. ++ * ++ * For all products with an inforom. 
++ * ++ * @param device The identifier of the target device ++ * @param timestamp The start timestamp of the last BBX Flush ++ * @param durationUs The duration (us) of the last BBX Flush ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a timestamp and \a durationUs are successfully retrieved ++ * - \ref NVML_ERROR_NOT_READY if the BBX object has not been flushed yet ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetInforomVersion ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetLastBBXFlushTime(nvmlDevice_t device, unsigned long long *timestamp, ++ unsigned long *durationUs); ++ ++/** ++ * Retrieves the display mode for the device. ++ * ++ * For all products. ++ * ++ * This method indicates whether a physical display (e.g. monitor) is currently connected to ++ * any of the device's connectors. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. ++ * ++ * @param device The identifier of the target device ++ * @param display Reference in which to return the display mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a display has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); ++ ++/** ++ * Retrieves the display active state for the device. ++ * ++ * For all products. ++ * ++ * This method indicates whether a display is initialized on the device. 
++ * For example whether X Server is attached to this device and has allocated memory for the screen. ++ * ++ * Display can be active even when no monitor is physically attached. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. ++ * ++ * @param device The identifier of the target device ++ * @param isActive Reference in which to return the display active state ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isActive has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); ++ ++/** ++ * Retrieves the persistence mode associated with this device. ++ * ++ * For all products. ++ * For Linux only. ++ * ++ * When driver persistence mode is enabled the driver software state is not torn down when the last ++ * client disconnects. By default this feature is disabled. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. 
++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current driver persistence mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetPersistenceMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); ++ ++/** ++ * Retrieves PCI attributes of this device. ++ * ++ * For all products. ++ * ++ * See \ref nvmlPciInfoExt_v1_t for details on the available PCI info. ++ * ++ * @param device The identifier of the target device ++ * @param pci Reference in which to return the PCI info ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pci has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfoExt(nvmlDevice_t device, nvmlPciInfoExt_t *pci); ++ ++/** ++ * Retrieves the PCI attributes of this device. ++ * ++ * For all products. ++ * ++ * See \ref nvmlPciInfo_t for details on the available PCI info. 
++ * ++ * @param device The identifier of the target device ++ * @param pci Reference in which to return the PCI info ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pci has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); ++ ++/** ++ * Retrieves the maximum PCIe link generation possible with this device and system ++ * ++ * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will ++ * report is generation 1. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param maxLinkGen Reference in which to return the max PCIe link generation ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a maxLinkGen has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); ++ ++/** ++ * Retrieves the maximum PCIe link generation supported by this device ++ * ++ * For Fermi &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param maxLinkGenDevice Reference in which to return the max PCIe link generation ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a maxLinkGenDevice has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGenDevice is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGenDevice); ++ ++/** ++ * Retrieves the maximum PCIe link width possible with this device and system ++ * ++ * I.E. for a device with a 16x PCIe bus width attached to an 8x PCIe system bus this function will report ++ * a max link width of 8. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param maxLinkWidth Reference in which to return the max PCIe link width ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); ++ ++/** ++ * Retrieves the current PCIe link generation ++ * ++ * For Fermi &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param currLinkGen Reference in which to return the current PCIe link generation ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a currLinkGen has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); ++ ++/** ++ * Retrieves the current PCIe link width ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param currLinkWidth Reference in which to return the current PCIe link width ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a currLinkWidth has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null ++ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); ++ ++/** ++ * Retrieve PCIe utilization information. ++ * This function is querying a byte counter over a 20ms interval and thus is the ++ * PCIe throughput over that interval. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * This method is not supported in virtual machines running virtual GPU (vGPU).
++ * ++ * @param device The identifier of the target device ++ * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t ++ * @param value Reference in which to return throughput in KB/s ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a value has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); ++ ++/** ++ * Retrieve the PCIe replay counter. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param value Reference in which to return the counter's value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a value has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); ++ ++/** ++ * Retrieves the current clock speeds for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlClockType_t for details on available clock information. 
++ * ++ * @param device The identifier of the target device ++ * @param type Identify which clock domain to query ++ * @param clock Reference in which to return the clock speed in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clock has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); ++ ++/** ++ * Retrieves the maximum clock speeds for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlClockType_t for details on available clock information. ++ * ++ * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks ++ * by few MHz. 
++ * ++ * @param device The identifier of the target device ++ * @param type Identify which clock domain to query ++ * @param clock Reference in which to return the clock speed in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clock has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); ++ ++/** ++ * Retrieve the GPCCLK VF offset value ++ * @param[in] device The identifier of the target device ++ * @param[out] offset The retrieved GPCCLK VF offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); ++ ++/** ++ * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. ++ * Can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the default applications clock that GPU boots with or ++ * defaults to after \ref nvmlDeviceResetApplicationsClocks call. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockMHz Reference in which to return the default clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * \see nvmlDeviceGetApplicationsClock ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the clock speed for the clock specified by the clock type and clock ID. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockId Identify which clock in the domain to query ++ * @param clockMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the customer defined maximum boost clock speed specified by the given clock type. ++ * ++ * For Pascal &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param clockType Identify which clock domain to query ++ * @param clockMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clockMHz has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); ++ ++/** ++ * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param count Reference in which to provide the \a clocksMHz array size, and ++ * to return the number of elements ++ * @param clocksMHz Reference in which to return the clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of ++ * required elements) ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetApplicationsClocks ++ * @see nvmlDeviceGetSupportedGraphicsClocks ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); ++ ++/** ++ * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param memoryClockMHz Memory clock for which to return possible graphics clocks ++ * @param count Reference in which to provide the \a clocksMHz array size, and ++ * to return the number of elements ++ * @param clocksMHz Reference in which to return the clocks in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetApplicationsClocks ++ * @see nvmlDeviceGetSupportedMemoryClocks ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); ++ ++/** ++ * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates ++ * to maximize performance as thermal limits allow. ++ * ++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. ++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost ++ * behavior. 
++ * ++ * @param device The identifier of the target device ++ * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device ++ * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will ++ * revert to when no applications are using the GPU ++ * ++ * @return ++ * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); ++ ++/** ++ * Retrieves the intended operating speed of the device's fan. ++ * ++ * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the ++ * output will not match the actual fan speed. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. ++ * This value may exceed 100% in certain cases. 
++ * ++ * @param device The identifier of the target device ++ * @param speed Reference in which to return the fan speed percentage ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a speed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); ++ ++ ++/** ++ * Retrieves the intended operating speed of the device's specified fan. ++ * ++ * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the ++ * output will not match the actual fan speed. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. ++ * This value may exceed 100% in certain cases. ++ * ++ * @param device The identifier of the target device ++ * @param fan The index of the target fan, zero indexed. 
++ * @param speed Reference in which to return the fan speed percentage ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a speed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed); ++ ++/** ++ * Retrieves the intended target speed of the device's specified fan. ++ * ++ * Normally, the driver dynamically adjusts the fan based on ++ * the needs of the GPU. But when user set fan speed using nvmlDeviceSetFanSpeed_v2, ++ * the driver will attempt to make the fan achieve the setting in ++ * nvmlDeviceSetFanSpeed_v2. The actual current speed of the fan ++ * is reported in nvmlDeviceGetFanSpeed_v2. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. ++ * This value may exceed 100% in certain cases. ++ * ++ * @param device The identifier of the target device ++ * @param fan The index of the target fan, zero indexed. 
++ * @param targetSpeed Reference in which to return the fan speed percentage ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a targetSpeed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a targetSpeed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed); ++ ++/** ++ * Retrieves the min and max fan speed that user can set for the GPU fan. ++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * @param device The identifier of the target device ++ * @param minSpeed The minimum speed allowed to set ++ * @param maxSpeed The maximum speed allowed to set ++ * ++ * return ++ * NVML_SUCCESS if speed has been adjusted ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if device is invalid ++ * NVML_ERROR_NOT_SUPPORTED if the device does not support this ++ * (doesn't have fans) ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed, ++ unsigned int * maxSpeed); ++ ++/** ++ * Gets current fan control policy. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * device The identifier of the target \a device ++ * policy Reference in which to return the fan control \a policy ++ * ++ * return ++ * NVML_SUCCESS if \a policy has been populated ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference ++ * a fan that exists. ++ * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan, ++ nvmlFanControlPolicy_t *policy); ++ ++/** ++ * Retrieves the number of fans on the device. ++ * ++ * For all discrete products with dedicated fans. ++ * ++ * @param device The identifier of the target device ++ * @param numFans The number of fans ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a fan number query was successful ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a numFans is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int *numFans); ++ ++/** ++ * Retrieves the current temperature readings for the device, in degrees C. ++ * ++ * For all products. ++ * ++ * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. 
++ * ++ * @param device The identifier of the target device ++ * @param sensorType Flag that indicates which sensor reading to retrieve ++ * @param temp Reference in which to return the temperature reading ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); ++ ++ ++/** ++ * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. ++ * ++ * Note: This API is no longer the preferred interface for retrieving the following temperature thresholds ++ * on Ada and later architectures: NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, ++ * NVML_TEMPERATURE_THRESHOLD_MEM_MAX and NVML_TEMPERATURE_THRESHOLD_GPU_MAX. ++ * ++ * Support for reading these temperature thresholds for Ada and later architectures would be removed from this ++ * API in future releases. Please use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_TEMPERATURE_* fields to retrieve ++ * temperature thresholds on these architectures. 
++ * ++ * @param device The identifier of the target device ++ * @param thresholdType The type of threshold value queried ++ * @param temp Reference in which to return the temperature reading ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); ++ ++/** ++ * Used to execute a list of thermal system instructions. ++ * ++ * @param device The identifier of the target device ++ * @param sensorIndex The index of the thermal sensor ++ * @param pThermalSettings Reference in which to return the thermal sensor information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pThermalSettings has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pThermalSettings is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t *pThermalSettings); ++ ++/** ++ * Retrieves the current performance state for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. 
++ * ++ * See \ref nvmlPstates_t for details on allowed performance states. ++ * ++ * @param device The identifier of the target device ++ * @param pState Reference in which to return the performance state reading ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pState has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); ++ ++/** ++ * Retrieves current clocks event reasons. ++ * ++ * For all fully supported products. ++ * ++ * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
++ * ++ * @param device The identifier of the target device ++ * @param clocksEventReasons Reference in which to return bitmask of active clocks event ++ * reasons ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a clocksEventReasons has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksEventReasons is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlClocksEventReasons ++ * @see nvmlDeviceGetSupportedClocksEventReasons ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksEventReasons(nvmlDevice_t device, unsigned long long *clocksEventReasons); ++ ++/** ++ * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); ++ ++/** ++ * Retrieves bitmask of supported clocks event reasons that can be returned by ++ * \ref nvmlDeviceGetCurrentClocksEventReasons ++ * ++ * For all fully supported products. ++ * ++ * This method is not supported in virtual machines running virtual GPU (vGPU). 
++ * ++ * @param device The identifier of the target device ++ * @param supportedClocksEventReasons Reference in which to return bitmask of supported ++ * clocks event reasons ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a supportedClocksEventReasons has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksEventReasons is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlClocksEventReasons ++ * @see nvmlDeviceGetCurrentClocksEventReasons ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksEventReasons(nvmlDevice_t device, unsigned long long *supportedClocksEventReasons); ++ ++/** ++ * @deprecated Use \ref nvmlDeviceGetSupportedClocksEventReasons instead ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); ++ ++/** ++ * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. ++ * ++ * Retrieve the current performance state for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlPstates_t for details on allowed performance states. 
++ * ++ * @param device The identifier of the target device ++ * @param pState Reference in which to return the performance state reading ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pState has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); ++ ++/** ++ * Retrieve performance monitor samples from the associated subdevice. ++ * ++ * @param device ++ * @param pDynamicPstatesInfo ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); ++ ++/** ++ * Retrieve the MemClk (Memory Clock) VF offset value. 
++ * @param[in] device The identifier of the target device ++ * @param[out] offset The retrieved MemClk VF offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); ++ ++/** ++ * Retrieve min and max clocks of some clock domain for a given PState ++ * ++ * @param device The identifier of the target device ++ * @param type Clock domain ++ * @param pstate PState to query ++ * @param minClockMHz Reference in which to return min clock frequency ++ * @param maxClockMHz Reference in which to return max clock frequency ++ * ++ * @return ++ * - \ref NVML_SUCCESS if everything worked ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both ++ * \a minClockMHz and \a maxClockMHz are NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, ++ unsigned int * minClockMHz, unsigned int * maxClockMHz); ++ ++/** ++ * Get all supported Performance States (P-States) for the device. ++ * ++ * The returned array would contain a contiguous list of valid P-States supported by ++ * the device. If the number of supported P-States is fewer than the size of the array ++ * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. ++ * ++ * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. 
++ * ++ * @param device The identifier of the target device ++ * @param pstates Container to return the list of performance states ++ * supported by device ++ * @param size Size of the supplied \a pstates array in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pstates array has been retrieved ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the container supplied was not large enough to ++ * hold the resulting list ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, ++ nvmlPstates_t *pstates, unsigned int size); ++ ++/** ++ * Retrieve the GPCCLK min max VF offset value. ++ * @param[in] device The identifier of the target device ++ * @param[out] minOffset The retrieved GPCCLK VF min offset value ++ * @param[out] maxOffset The retrieved GPCCLK VF max offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, ++ int *minOffset, int *maxOffset); ++ ++/** ++ * Retrieve the MemClk (Memory Clock) min max VF offset value. 
++ * @param[in] device The identifier of the target device ++ * @param[out] minOffset The retrieved MemClk VF min offset value ++ * @param[out] maxOffset The retrieved MemClk VF max offset value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, ++ int *minOffset, int *maxOffset); ++ ++/** ++ * Retrieve min, max and current clock offset of some clock domain for a given PState ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Note: \ref nvmlDeviceGetGpcClkVfOffset, \ref nvmlDeviceGetMemClkVfOffset, \ref nvmlDeviceGetGpcClkMinMaxVfOffset and ++ * \ref nvmlDeviceGetMemClkMinMaxVfOffset will be deprecated in a future release. ++ * Use \ref nvmlDeviceGetClockOffsets instead. 
++ * ++ * @param device The identifier of the target device ++ * @param info Structure specifying the clock type (input) and the pstate (input) ++ * retrieved clock offset value (output), min clock offset (output) ++ * and max clock offset (output) ++ * ++ * @return ++ * - \ref NVML_SUCCESS if everything worked ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both ++ * \a minClockOffsetMHz and \a maxClockOffsetMHz are NULL ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info); ++ ++/** ++ * Control current clock offset of some clock domain for a given PState ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param info Structure specifying the clock type (input), the pstate (input) ++ * and clock offset value (input) ++ * ++ * @return ++ * - \ref NVML_SUCCESS if everything worked ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both ++ * \a clockOffsetMHz is out of allowed range. ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info); ++ ++/** ++ * This API has been deprecated. ++ * ++ * Retrieves the power management mode associated with this device. 
++ * ++ * For products from the Fermi family. ++ * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. ++ * ++ * For products from the Kepler or newer families. ++ * - Does not require \a NVML_INFOROM_POWER object. ++ * ++ * This flag indicates whether any power management algorithm is currently active on the device. An ++ * enabled state does not necessarily mean the device is being actively throttled -- only that ++ * the driver will do so if the appropriate conditions are met. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. ++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current power management mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); ++ ++/** ++ * Retrieves the power management limit associated with this device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * The power limit defines the upper boundary for the card's power draw. If ++ * the card's total power draw reaches this limit the power management algorithm kicks in. ++ * ++ * This reading is only available if power management mode is supported. ++ * See \ref nvmlDeviceGetPowerManagementMode. 
++ * ++ * @param device The identifier of the target device ++ * @param limit Reference in which to return the power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a limit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); ++ ++/** ++ * Retrieves information about possible values of power management limits on this device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param minLimit Reference in which to return the minimum power management limit in milliwatts ++ * @param maxLimit Reference in which to return the maximum power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetPowerManagementLimit ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); ++ ++/** ++ * Retrieves default power management limit on this device, in milliwatts. 
++ * Default power management limit is a power management limit that the device boots with. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param defaultLimit Reference in which to return the default power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a defaultLimit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); ++ ++/** ++ * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. On Ampere ++ * (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval. On GA100 and ++ * older architectures, instantaneous power is returned. ++ * ++ * See \ref NVML_FI_DEV_POWER_AVERAGE and \ref NVML_FI_DEV_POWER_INSTANT to query specific power ++ * values. ++ * ++ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. 
++ * ++ * @param device The identifier of the target device ++ * @param power Reference in which to return the power usage information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a power has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); ++ ++/** ++ * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded ++ * ++ * For Volta &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param energy Reference in which to return the energy consumption information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a energy has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); ++ ++/** ++ * Get the effective power limit that the driver enforces after taking into account all limiters ++ * ++ * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere ++ * This includes the out of band power limit interface ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The device to communicate with ++ * @param limit Reference in which to return the power management limit in milliwatts ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a limit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); ++ ++/** ++ * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). ++ * ++ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. ++ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. ++ * Not supported on Quadro ® and Tesla &tm; C-class products. 
++ * ++ * @param device The identifier of the target device ++ * @param current Reference in which to return the current GOM ++ * @param pending Reference in which to return the pending GOM ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current and \a pending have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlGpuOperationMode_t ++ * @see nvmlDeviceSetGpuOperationMode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); ++ ++/** ++ * Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. ++ * The reserved amount is supported on version 2 only. ++ * ++ * For all products. ++ * ++ * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. ++ * Under WDDM most device memory is allocated and managed on startup by Windows. ++ * ++ * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated ++ * by all active channels on the device. ++ * ++ * See \ref nvmlMemory_v2_t for details on available memory info. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate ++ * information, only if the caller has appropriate privileges. Per-instance ++ * information can be queried by using specific MIG device handles. ++ * ++ * @note nvmlDeviceGetMemoryInfo_v2 adds additional memory information.
++ * ++ * @note On systems where GPUs are NUMA nodes, the accuracy of FB memory utilization ++ * provided by this API depends on the memory accounting of the operating system. ++ * This is because FB memory is managed by the operating system instead of the NVIDIA GPU driver. ++ * Typically, pages allocated from FB memory are not released even after ++ * the process terminates to enhance performance. In scenarios where ++ * the operating system is under memory pressure, it may resort to utilizing FB memory. ++ * Such actions can result in discrepancies in the accuracy of memory reporting. ++ * ++ * @param device The identifier of the target device ++ * @param memory Reference in which to return the memory information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a memory has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); ++ ++/** ++ * nvmlDeviceGetMemoryInfo_v2 accounts separately for reserved memory and includes it in the used memory amount. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t *memory); ++ ++/** ++ * Retrieves the current compute mode for the device. ++ * ++ * For all products. ++ * ++ * See \ref nvmlComputeMode_t for details on allowed compute modes. 
++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current compute mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetComputeMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); ++ ++/** ++ * Retrieves the CUDA compute capability of the device. ++ * ++ * For all products. ++ * ++ * Returns the major and minor compute capability version numbers of the ++ * device. The major and minor versions are equivalent to the ++ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR and ++ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR attributes that would be ++ * returned by CUDA's cuDeviceGetAttribute().
++ * ++ * @param device The identifier of the target device ++ * @param major Reference in which to return the major CUDA compute capability ++ * @param minor Reference in which to return the minor CUDA compute capability ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a major and \a minor have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); ++ ++/** ++ * Retrieves the current and pending ECC modes for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * ++ * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following ++ * the next reboot. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. 
++ * ++ * @param device The identifier of the target device ++ * @param current Reference in which to return the current ECC mode ++ * @param pending Reference in which to return the pending ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current and \a pending have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetEccMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); ++ ++/** ++ * Retrieves the default ECC modes for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * ++ * See \ref nvmlEnableState_t for details on allowed modes. 
++ * ++ * @param device The identifier of the target device ++ * @param defaultMode Reference in which to return the default ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a defaultMode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetEccMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); ++ ++/** ++ * Retrieves the device boardId from 0-N. ++ * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with ++ * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. ++ * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across ++ * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and ++ * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will ++ * always return those values but they will always be different from each other). ++ * ++ * ++ * For Fermi &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param boardId Reference in which to return the device's board ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a boardId has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); ++ ++/** ++ * Retrieves whether the device is on a Multi-GPU Board ++ * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param multiGpuBool Reference in which to return a zero or non-zero value ++ * to indicate whether the device is on a multi GPU board ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a multiGpuBool has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); ++ ++/** ++ * Retrieves the total ECC error counts for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * Requires ECC Mode to be enabled. 
++ * ++ * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of ++ * errors across the entire device. ++ * ++ * See \ref nvmlMemoryErrorType_t for a description of available error types.\n ++ * See \ref nvmlEccCounterType_t for a description of available counter types. ++ * ++ * @param device The identifier of the target device ++ * @param errorType Flag that specifies the type of the errors. ++ * @param counterType Flag that specifies the counter-type of the errors. ++ * @param eccCounts Reference in which to return the specified ECC errors ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a eccCounts has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceClearEccErrorCounts() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); ++ ++/** ++ * Retrieves the detailed ECC error counts for the device. ++ * ++ * @deprecated This API supports only a fixed set of ECC error locations ++ * On different GPU architectures different locations are supported ++ * See \ref nvmlDeviceGetMemoryErrorCounter ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. ++ * Requires ECC Mode to be enabled. 
++ * ++ * Detailed errors provide separate ECC counts for specific parts of the memory system. ++ * ++ * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. ++ * ++ * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n ++ * See \ref nvmlEccCounterType_t for a description of available counter types.\n ++ * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. ++ * ++ * @param device The identifier of the target device ++ * @param errorType Flag that specifies the type of the errors. ++ * @param counterType Flag that specifies the counter-type of the errors. ++ * @param eccCounts Reference in which to return the specified ECC errors ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a eccCounts has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceClearEccErrorCounts() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); ++ ++/** ++ * Retrieves the requested memory error counter for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. ++ * ++ * Only applicable to devices with ECC. ++ * ++ * Requires ECC Mode to be enabled. 
++ * ++ * @note On MIG-enabled GPUs, per instance information can be queried using specific ++ * MIG device handles. Per instance information is currently only supported for ++ * non-DRAM uncorrectable volatile errors. Querying volatile errors using device ++ * handles is currently not supported. ++ * ++ * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n ++ * See \ref nvmlEccCounterType_t for a description of available counter types.\n ++ * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n ++ * ++ * @param device The identifier of the target device ++ * @param errorType Flag that specifies the type of error. ++ * @param counterType Flag that specifies the counter-type of the errors. ++ * @param locationType Specifies the location of the counter. ++ * @param count Reference in which to return the ECC counter ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType, \a counterType or \a locationType is ++ * invalid, or \a count is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, ++ nvmlEccCounterType_t counterType, ++ nvmlMemoryLocation_t locationType, unsigned long long *count); ++ ++/** ++ * Retrieves the current utilization rates for the device's major subsystems. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlUtilization_t for details on available utilization rates.
++ * ++ * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. ++ * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. ++ * ++ * @note On MIG-enabled GPUs, querying device utilization rates is not currently supported. ++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference in which to return the utilization information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the Encoder ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @note On MIG-enabled GPUs, querying encoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for encoder utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++ * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param encoderQueryType Type of encoder to query ++ * @param encoderCapacity Reference to an unsigned int for the encoder capacity ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a encoderCapacity is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encoderQueryType ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); ++ ++/** ++ * Retrieves the current encoder statistics for a given device. ++ * ++ * For Maxwell &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param sessionCount Reference to an unsigned int for count of active encoder sessions ++ * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions ++ * @param averageLatency Reference to an unsigned int for encode latency in microseconds ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, ++ * or \a averageLatency is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, ++ unsigned int *averageFps, unsigned int *averageLatency); ++ ++/** ++ * Retrieves information about active encoder sessions on a target device. ++ * ++ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The ++ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++ * written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the active session array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. ++ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return ++ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param sessionCount Reference to caller supplied array size, and returns the number of sessions. ++ * @param sessionInfos Reference in which to return the session information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionInfos is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the Decoder ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for decoder utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the JPG ++ * ++ * %TURING_OR_NEWER% ++ * ++ * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for jpg utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetJpgUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++ * Retrieves the current utilization and sampling size in microseconds for the OFA (Optical Flow Accelerator) ++ * ++ * %TURING_OR_NEWER% ++ * ++ * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
++ * ++ * @param device The identifier of the target device ++ * @param utilization Reference to an unsigned int for ofa utilization info ++ * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetOfaUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); ++ ++/** ++* Retrieves the active frame buffer capture sessions statistics for a given device. ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @param device The identifier of the target device ++* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats ++* ++* @return ++* - \ref NVML_SUCCESS if \a fbcStats is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL ++* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); ++ ++/** ++* Retrieves information about active frame buffer capture sessions on a target device. ++* ++* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. 
The ++* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++* written to the buffer. ++* ++* If the supplied buffer is not large enough to accommodate the active session array, the function returns ++* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. ++* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return ++* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may ++* be zero if there are no new frames captured since the session started. ++* ++* @param device The identifier of the target device ++* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. ++* @param sessionInfo Reference in which to return the session information ++* ++* @return ++* - \ref NVML_SUCCESS if \a sessionInfo is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. ++* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); ++ ++/** ++ * Retrieves the current and pending driver model for the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * For windows only. ++ * ++ * On Windows platforms the device driver can run in either WDDM, MCDM or WDM (TCC) modes. 
If a display is attached ++ * to the device it must run in WDDM mode. MCDM mode is preferred if a display is not attached. TCC mode is deprecated. ++ * ++ * See \ref nvmlDriverModel_t for details on available driver models. ++ * ++ * @param device The identifier of the target device ++ * @param current Reference in which to return the current driver model ++ * @param pending Reference in which to return the pending driver model ++ * ++ * @return ++ * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceSetDriverModel_v2() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel_v2(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); ++ ++/** ++ * Get VBIOS version of the device. ++ * ++ * For all products. ++ * ++ * The VBIOS version may change from time to time. It will not exceed 32 characters in length ++ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
++ * ++ * @param device The identifier of the target device ++ * @param version Reference to which to return the VBIOS version ++ * @param length The maximum allowed length of the string returned in \a version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); ++ ++/** ++ * Get Bridge Chip Information for all the bridge chips on the board. ++ * ++ * For all fully supported products. ++ * Only applicable to multi-GPU products. ++ * ++ * @param device The identifier of the target device ++ * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy ++ * ++ * @return ++ * - \ref NVML_SUCCESS if bridge chip exists ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); ++ ++/** ++ * Get information about processes with a compute context on a device ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * This function returns information only about compute running processes (e.g. CUDA application which have ++ * active context).
Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. ++ * ++ * To query the current number of running compute processes, call this function with *infoCount = 0. The ++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call ++ * \a infos is allowed to be NULL. ++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. ++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a infos table in case new compute processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. 
++ * ++ * @param device The device handle or MIG device handle ++ * @param infoCount Reference in which to provide the \a infos array size, and ++ * to return the number of returned elements ++ * @param infos Reference in which to return the process information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small ++ * \a infoCount will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see \ref nvmlSystemGetProcessName ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); ++ ++/** ++ * Get information about processes with a graphics context on a device ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * This function returns information only about graphics based processes ++ * (eg. applications using OpenGL, DirectX) ++ * ++ * To query the current number of running graphics processes, call this function with *infoCount = 0. The ++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call ++ * \a infos is allowed to be NULL. ++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. 
++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a infos table in case new graphics processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. ++ * ++ * @param device The device handle or MIG device handle ++ * @param infoCount Reference in which to provide the \a infos array size, and ++ * to return the number of returned elements ++ * @param infos Reference in which to return the process information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small ++ * \a infoCount will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see \ref nvmlSystemGetProcessName ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); ++ ++/** ++ * Get information about processes with a Multi-Process Service (MPS) compute context on a device ++ * ++ * For Volta &tm; or newer 
fully supported devices. ++ * ++ * This function returns information only about compute running processes (e.g. CUDA application which have ++ * active context) utilizing MPS. Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by ++ * this function. ++ * ++ * To query the current number of running compute processes, call this function with *infoCount = 0. The ++ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call ++ * \a infos is allowed to be NULL. ++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. ++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a infos table in case new compute processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. 
++ * ++ * @param device The device handle or MIG device handle ++ * @param infoCount Reference in which to provide the \a infos array size, and ++ * to return the number of returned elements ++ * @param infos Reference in which to return the process information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small ++ * \a infoCount will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see \ref nvmlSystemGetProcessName ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); ++ ++/** ++ * Get information about running processes on a device for input context ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * This function returns information only about running processes (e.g. CUDA application which have ++ * active context). ++ * ++ * To determine the size of the \a plist->procArray array to allocate, call the function with ++ * \a plist->numProcArrayEntries set to zero and \a plist->procArray set to NULL. The return ++ * code will be either NVML_ERROR_INSUFFICIENT_SIZE (if there are valid processes of type ++ * \a plist->mode to report on, in which case the \a plist->numProcArrayEntries field will ++ * indicate the required number of entries in the array) or NVML_SUCCESS (if no processes of type ++ * \a plist->mode exist). 
++ * ++ * The usedGpuMemory field returned is all of the memory used by the application. ++ * The usedGpuCcProtectedMemory field returned is all of the protected memory used by the application. ++ * ++ * Keep in mind that information returned by this call is dynamic and the number of elements might change in ++ * time. Allocate more space for \a plist->procArray table in case new processes are spawned. ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if ++ * the caller has appropriate privileges. Per-instance information can be queried by using ++ * specific MIG device handles. ++ * Querying per-instance information using MIG device handles is not supported if the device is in ++ * vGPU Host virtualization mode. ++ * Protected memory usage is currently not available in MIG mode and on Windows. ++ * ++ * @param device The device handle or MIG device handle ++ * @param plist Reference in which to return the process detail list ++ * \a plist->version The api version ++ * \a plist->mode The process mode ++ * \a plist->procArray Reference in which to return the process information ++ * \a plist->numProcArrayEntries Proc array size of returned entries ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a plist->numProcArrayEntries and \a plist->procArray have been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a plist->numProcArrayEntries indicates that the \a plist->procArray is too small ++ * \a plist->numProcArrayEntries will contain minimal amount of space necessary for ++ * the call to complete ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a plist is NULL, \a plist->version is invalid, ++ * or \a plist->mode is invalid ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise
inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRunningProcessDetailList(nvmlDevice_t device, nvmlProcessDetailList_t *plist); ++ ++/** ++ * Check if the GPU devices are on the same physical board. ++ * ++ * For all fully supported products. ++ * ++ * @param device1 The first GPU device ++ * @param device2 The second GPU device ++ * @param onSameBoard Reference in which to return the status. ++ * Non-zero indicates that the GPUs are on the same board. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a onSameBoard has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 are invalid or \a onSameBoard is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); ++ ++/** ++ * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. ++ * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. ++ * ++ * For all fully supported products.
++ * ++ * @param device The identifier of the target device ++ * @param apiType Target API type for this operation ++ * @param isRestricted Reference in which to return the current restriction ++ * NVML_FEATURE_ENABLED indicates that the API is root-only ++ * NVML_FEATURE_DISABLED indicates that the API is accessible to all users ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isRestricted has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support ++ * the feature that is being queried (E.G. Enabling/disabling Auto Boosted clocks is ++ * not supported by the device) ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlRestrictedAPI_t ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); ++ ++/** ++ * Gets recent samples for the GPU. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by ++ * the driver. ++ * ++ * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. ++ * ++ * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. ++ * The returned samplesCount will provide the number of samples that can be queried. The user needs to ++ * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). ++ * ++ * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the ++ * underlying buffer. 
Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query ++ * to get more recent samples. ++ * ++ * This method fetches the number of entries which can be accommodated in the provided samples array, and the ++ * reference sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this ++ * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost. ++ * ++ * @note On MIG-enabled GPUs, querying the following sample types, NVML_GPU_UTILIZATION_SAMPLES, NVML_MEMORY_UTILIZATION_SAMPLES ++ * NVML_ENC_UTILIZATION_SAMPLES and NVML_DEC_UTILIZATION_SAMPLES, is not currently supported. ++ * ++ * @param device The identifier for the target device ++ * @param type Type of sampling event ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. ++ * @param sampleValType Output parameter to represent the type of sample value as described in nvmlValueType_t ++ * @param sampleCount Reference to provide the number of elements which can be queried in samples array ++ * @param samples Reference in which samples are returned ++ ++ * @return ++ * - \ref NVML_SUCCESS if samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sampleCount is NULL or ++ * reference to \a sampleCount is 0 for non null \a samples ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, ++ nvmlValueType_t *sampleValType,
unsigned int *sampleCount, nvmlSample_t *samples); ++ ++/** ++ * Gets Total, Available and Used size of BAR1 memory. ++ * ++ * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party ++ * devices (peer-to-peer on the PCIE bus). ++ * ++ * @note In MIG mode, if device handle is provided, the API returns aggregate ++ * information, only if the caller has appropriate privileges. Per-instance ++ * information can be queried by using specific MIG device handles. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param bar1Memory Reference in which BAR1 memory ++ * information is returned. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bar1Memory is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); ++ ++/** ++ * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power ++ * or thermal constraints. ++ * ++ * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The ++ * difference in violation times at two different reference times gives an indication of a GPU throttling event. ++ * ++ * Violation for thermal capping is not supported at this time. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param perfPolicyType Represents Performance policy which can trigger GPU throttling ++ * @param violTime Reference to which violation time related information is returned ++ * ++ * ++ * @return ++ * - \ref NVML_SUCCESS if violation time is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); ++ ++/** ++ * Gets the device's interrupt number ++ * ++ * @param device The identifier of the target device ++ * @param irqNum The interrupt number associated with the specified device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if irq number is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a irqNum is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqNum); ++ ++/** ++ * Gets the device's core count ++ * ++ * @param device The identifier of the target device ++ * @param numCores The number of cores for the specified device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if Gpu core count is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a 
device is invalid, or \a numCores is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int *numCores); ++ ++/** ++ * Gets the device's power source ++ * ++ * @param device The identifier of the target device ++ * @param powerSource The power source of the device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the current power source was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a powerSource is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t *powerSource); ++ ++/** ++ * Gets the device's memory bus width ++ * ++ * @param device The identifier of the target device ++ * @param busWidth The device's memory bus width ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the memory bus width is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a busWidth is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int *busWidth); ++ ++/** ++ * Gets the device's PCIE Max Link speed in MBPS ++ * ++ * @param device The identifier of the target device ++ * @param maxSpeed The device's PCIE Max Link speed in MBPS ++ * ++ * @return
++ * - \ref NVML_SUCCESS if the PCIe max link speed is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a maxSpeed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int *maxSpeed); ++ ++/** ++ * Gets the device's PCIe Link speed in Mbps ++ * ++ * @param device The identifier of the target device ++ * @param pcieSpeed The device's PCIe Max Link speed in Mbps ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pcieSpeed has been retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pcieSpeed is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support querying the PCIe speed ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *pcieSpeed); ++ ++/** ++ * Gets the device's Adaptive Clock status ++ * ++ * @param device The identifier of the target device ++ * @param adaptiveClockStatus The current adaptive clocking status, either ++ * \p NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED ++ * or \p NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the current adaptive clocking status is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a adaptiveClockStatus is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise
inaccessible ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int *adaptiveClockStatus); ++ ++/** ++ * Get the type of the GPU Bus (PCIe, PCI, ...) ++ * ++ * @param device The identifier of the target device ++ * @param type The PCI Bus type ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the bus \a type is successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a type is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); ++ ++ ++ /** ++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceGetGpuFabricInfoV instead ++ * ++ * Get fabric information associated with the device. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager. ++ * Upon successful registration, the GPU is added to the NVLink fabric to enable ++ * peer-to-peer communication. ++ * This API reports the current state of the GPU in the NVLink fabric ++ * along with other useful information. ++ * ++ * ++ * @param device The identifier of the target device ++ * @param gpuFabricInfo Information about GPU fabric state ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); ++ ++/** ++* Versioned wrapper around \ref nvmlDeviceGetGpuFabricInfo that accepts a versioned ++* \ref nvmlGpuFabricInfo_v2_t or later output structure. ++* ++* @note The caller must set the \ref nvmlGpuFabricInfoV_t.version field to the ++* appropriate version prior to calling this function.
For example: ++* \code ++* nvmlGpuFabricInfoV_t fabricInfo = ++* { .version = nvmlGpuFabricInfo_v2 }; ++* nvmlReturn_t result = nvmlDeviceGetGpuFabricInfoV(device,&fabricInfo); ++* \endcode ++* ++* For Hopper &tm; or newer fully supported devices. ++* ++* @param device The identifier of the target device ++* @param gpuFabricInfo Information about GPU fabric state ++* ++* @return ++* - \ref NVML_SUCCESS Upon success ++* - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, ++ nvmlGpuFabricInfoV_t *gpuFabricInfo); ++ ++/** ++ * Set new power limit of this device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. ++ * ++ * See \ref nvmlPowerValue_v2_t for more information on the struct. ++ * ++ * \note Limit is not persistent across reboots or driver unloads. ++ * Enable persistent mode to prevent driver from unloading when no application is using the device. ++ * ++ * This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. 
++ * ++ * @param device The identifier of the target device ++ * @param powerValue Power management limit in milliwatts to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the power limit in \a powerValue has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see NVML_FI_DEV_POWER_AVERAGE ++ * @see NVML_FI_DEV_POWER_INSTANT ++ * @see NVML_FI_DEV_POWER_MIN_LIMIT ++ * @see NVML_FI_DEV_POWER_MAX_LIMIT ++ * @see NVML_FI_DEV_POWER_CURRENT_LIMIT ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice_t device, nvmlPowerValue_v2_t *powerValue); ++ ++/** ++ * Get SRAM ECC error status of this device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * See \ref nvmlEccSramErrorStatus_v1_t for more information on the struct.
++ * ++ * @param device The identifier of the target device ++ * @param status Returns SRAM ECC error status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a status has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a nvmlEccSramErrorStatus_t is invalid ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSramEccErrorStatus(nvmlDevice_t device, ++ nvmlEccSramErrorStatus_t *status); ++ ++/** ++ * Get Conf Computing System capabilities. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param capabilities System CC capabilities ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a capabilities were successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capabilities is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities); ++ ++/** ++ * Get Conf Computing System State. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC.
++ * ++ * @param state System CC State ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a state was successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a state is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state); ++ ++/** ++ * Get Conf Computing Protected and Unprotected Memory Sizes. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device Device handle ++ * @param memInfo Protected/Unprotected Memory sizes ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a memInfo was successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a memInfo or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo); ++ ++/** ++ * Get Conf Computing GPUs ready state. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param isAcceptingWork Returns GPU current work accepting state, ++ * NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or ++ * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the current GPUs ready state was successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork); ++ ++/** ++ * Get Conf Computing protected memory usage.
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device The identifier of the target device ++ * @param memory Reference in which to return the memory information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a memory has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory); ++ ++/** ++ * Get Conf Computing Gpu certificate details. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device The identifier of the target device ++ * @param gpuCert Reference in which to return the gpu certificate information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a gpuCert has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a gpuCert is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device, ++ nvmlConfComputeGpuCertificate_t *gpuCert); ++ ++/** ++ * Get Conf Computing Gpu attestation report. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC.
++ * ++ * @param device The identifier of the target device ++ * @param gpuAtstReport Reference in which to return the gpu attestation report ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a gpu attestation report has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device, ++ nvmlConfComputeGpuAttestationReport_t *gpuAtstReport); ++/** ++ * Get Conf Computing key rotation threshold detail. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param pKeyRotationThrInfo Reference in which to return the key rotation threshold data ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a gpu key rotation threshold info has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeKeyRotationThresholdInfo( ++ nvmlConfComputeGetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); ++ ++/** ++ * Set Conf Computing Unprotected Memory Size. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. 
++ * ++ * @param device Device Handle ++ * @param sizeKiB Unprotected Memory size to be set in KiB ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sizeKiB was successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeUnprotectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB); ++ ++/** ++ * Set Conf Computing GPUs ready state. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or ++ * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork); ++ ++/** ++ * Set Conf Computing key rotation threshold. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * This function is to set the confidential compute key rotation threshold parameters. ++ * \a pKeyRotationThrInfo->maxAttackerAdvantage should be in the range from ++ * NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN to NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX. ++ * Default value is 60.
++ * ++ * @param pKeyRotationThrInfo Reference to the key rotation threshold data ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a key rotation threshold max attacker advantage has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL ++ * - \ref NVML_ERROR_INVALID_STATE if confidential compute GPU ready state is enabled ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemSetConfComputeKeyRotationThresholdInfo( ++ nvmlConfComputeSetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); ++ ++/** ++ * Get Conf Computing System Settings. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param settings System CC settings ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the query succeeds ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counters is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetConfComputeSettings(nvmlSystemConfComputeSettings_t *settings); ++ ++/** ++ * Retrieve GSP firmware version. ++ * ++ * The caller passes in buffer via \a version and corresponding GSP firmware numbered version ++ * is returned with the same parameter in string format.
++ * ++ * @param device Device handle ++ * @param version The retrieved GSP firmware version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if GSP firmware version is successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); ++ ++/** ++ * Retrieve GSP firmware mode. ++ * ++ * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with ++ * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. ++ * ++ * @param device Device handle ++ * @param isEnabled Pointer to specify if GSP firmware is enabled ++ * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if GSP firmware mode is successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); ++ ++/** ++ * @} ++ */ ++ ++/** @addtogroup nvmlAccountingStats ++ * @{ ++ */ ++ ++/** ++ * Queries the state of per process accounting mode. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlDeviceGetAccountingStats for more details.
++ * See \ref nvmlDeviceSetAccountingMode ++ * ++ * @param device The identifier of the target device ++ * @param mode Reference in which to return the current accounting mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the mode has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); ++ ++/** ++ * Queries process's accounting stats. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. ++ * Accounting stats can be queried during the lifetime of the process and after its termination. ++ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and ++ * updated to actual running time after its termination. ++ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old ++ * processes. ++ * ++ * See \ref nvmlAccountingStats_t for description of each returned metric. ++ * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. ++ * ++ * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. ++ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be ++ * queried since they don't contribute to GPU utilization. ++ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported ++ * ++ * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU.
++ * ++ * @param device The identifier of the target device ++ * @param pid Process Id of the target process to query stats for ++ * @param stats Reference in which to return the process's accounting stats ++ * ++ * @return ++ * - \ref NVML_SUCCESS if stats have been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if process stats were not found ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled ++ * or on vGPU host. ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetAccountingBufferSize ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); ++ ++/** ++ * Queries list of processes that can be queried for accounting stats. The list of processes returned ++ * can be in running or terminated state. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * To just query the number of processes ready to be queried, call this function with *count = 0 and ++ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. ++ * ++ * For more details see \ref nvmlDeviceGetAccountingStats. ++ * ++ * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
++ * ++ * @param device The identifier of the target device ++ * @param count Reference in which to provide the \a pids array size, and ++ * to return the number of elements ready to be queried ++ * @param pids Reference in which to return list of process ids ++ * ++ * @return ++ * - \ref NVML_SUCCESS if pids were successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled ++ * or on vGPU host. ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to ++ * expected value) ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetAccountingBufferSize ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); ++ ++/** ++ * Returns the number of processes that the circular buffer with accounting pids can hold. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * This is the maximum number of processes that accounting information will be stored for before information ++ * about oldest processes will get overwritten by information about new processes. ++ * ++ * @param device The identifier of the target device ++ * @param bufferSize Reference in which to provide the size (in number of elements) ++ * of the circular buffer for accounting stats. 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if buffer size was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetAccountingStats ++ * @see nvmlDeviceGetAccountingPids ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); ++ ++/** @} */ ++ ++/** @addtogroup nvmlDeviceQueries ++ * @{ ++ */ ++ ++/** ++ * Returns the list of retired pages by source, including pages that are pending retirement ++ * The address information provided from this API is the hardware address of the page that was retired. Note ++ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param cause Filter page addresses by cause of retirement ++ * @param pageCount Reference in which to provide the \a addresses buffer size, and ++ * to return the number of retired pages that match \a cause ++ * Set to 0 to query the size without allocating an \a addresses buffer ++ * @param addresses Buffer to write the page addresses into ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the ++ * matching page addresses. \a pageCount is set to the needed size. 
++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or ++ * \a addresses is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, ++ unsigned int *pageCount, unsigned long long *addresses); ++ ++/** ++ * Returns the list of retired pages by source, including pages that are pending retirement ++ * The address information provided from this API is the hardware address of the page that was retired. Note ++ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 ++ * ++ * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's ++ * retirement. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param cause Filter page addresses by cause of retirement ++ * @param pageCount Reference in which to provide the \a addresses buffer size, and ++ * to return the number of retired pages that match \a cause ++ * Set to 0 to query the size without allocating an \a addresses buffer ++ * @param addresses Buffer to write the page addresses into ++ * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the ++ * matching page addresses. \a pageCount is set to the needed size. 
++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or ++ * \a addresses is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, ++ unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); ++ ++/** ++ * Check if any pages are pending retirement and need a reboot to fully retire. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param isPending Reference in which to return the pending status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isPending was populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); ++ ++/** ++ * Get number of remapped rows. The number of rows reported will be based on ++ * the cause of the remapping. isPending indicates whether or not there are ++ * pending remappings. A reset will be required to actually remap the row. ++ * failureOccurred will be set if a row remapping ever failed in the past. 
A ++ * pending remapping won't affect future work on the GPU since ++ * error-containment and dynamic page blacklisting will take care of that. ++ * ++ * @note On MIG-enabled GPUs with active instances, querying the number of ++ * remapped rows is not supported ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param corrRows Reference for number of rows remapped due to correctable errors ++ * @param uncRows Reference for number of rows remapped due to uncorrectable errors ++ * @param isPending Reference for whether or not remappings are pending ++ * @param failureOccurred Reference that is set when a remapping has failed in the past ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN Unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows, ++ unsigned int *isPending, unsigned int *failureOccurred); ++ ++/** ++ * Get the row remapper histogram. Returns the remap availability for each bank ++ * on the GPU. ++ * ++ * @param device Device handle ++ * @param values Histogram values ++ * ++ * @return ++ * - \ref NVML_SUCCESS On success ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); ++ ++/** ++ * Get architecture for device ++ * ++ * @param device The identifier of the target device ++ * @param arch Reference where architecture is returned, if call successful. 
++ * Set to NVML_DEVICE_ARCH_* upon success ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output reference) are invalid ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); ++ ++/** ++ * Retrieves the frequency monitor fault status for the device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Requires root user. ++ * ++ * See \ref nvmlClkMonStatus_t for details on decoding the status output. ++ * ++ * @param device The identifier of the target device ++ * @param status Reference in which to return the clkmon fault status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a status has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetClkMonStatus() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status); ++ ++/** ++ * Retrieves the current utilization and process ID ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. ++ * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at ++ * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization ++ * during the last sample period. It includes the CPU timestamp at which the samples were recorded.
Individual utilization values ++ * are returned as "unsigned int" values. If no valid sample entries are found since the lastSeenTimeStamp, NVML_ERROR_NOT_FOUND ++ * is returned. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a utilization set to NULL. The caller should allocate a buffer of size ++ * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed ++ * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. ++ * ++ * On successful return, the function updates \a processSamplesCount with the number of process utilization sample ++ * structures that were actually written. This may differ from a previously read value as instances are created or ++ * destroyed. ++ * ++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * @note On MIG-enabled GPUs, querying process utilization is not currently supported. ++ * ++ * @param device The identifier of the target device ++ * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned ++ * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
++ ++ * @return ++ * - \ref NVML_SUCCESS if \a utilization has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, ++ unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); ++ ++/** ++ * Retrieves the recent utilization and process ID for all running processes ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder, jpeg decoder, OFA (Optical Flow Accelerator) ++ * for all running processes. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at ++ * by \a procesesUtilInfo->procUtilArray. One utilization sample structure is returned per process running, that had some non-zero utilization ++ * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values ++ * are returned as "unsigned int" values. ++ * ++ * The caller should allocate a buffer of size processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t). If the buffer is too small, the API will ++ * return \a NVML_ERROR_INSUFFICIENT_SIZE, with the recommended minimal buffer size at \a procesesUtilInfo->processSamplesCount. 
The caller should ++ * invoke the function again with the allocated buffer passed in \a procesesUtilInfo->procUtilArray, and \a procesesUtilInfo->processSamplesCount ++ * set to the number no less than the recommended value by the previous API return. ++ * ++ * On successful return, the function updates \a procesesUtilInfo->processSamplesCount with the number of process utilization info structures ++ * that were actually written. This may differ from a previously read value as instances are created or destroyed. ++ * ++ * \a procesesUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a procesesUtilInfo->lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * \a procesesUtilInfo->version is the version number of the structure nvmlProcessesUtilizationInfo_t, the caller should set the correct version ++ * number to retrieve the specific version of processes utilization information. ++ * ++ * @note On MIG-enabled GPUs, querying process utilization is not currently supported. ++ * ++ * @param device The identifier of the target device ++ * @param procesesUtilInfo Pointer to the caller-provided structure of nvmlProcessesUtilizationInfo_t. 
++ ++ * @return ++ * - \ref NVML_SUCCESS if \a procesesUtilInfo->procUtilArray has been populated ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a procesesUtilInfo is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a procesesUtilInfo is invalid ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a procesesUtilInfo->procUtilArray is NULL, or the buffer size of procesesUtilInfo->procUtilArray is too small. ++ * The caller should check the minimum array size from the returned procesesUtilInfo->processSamplesCount, and call ++ * the function again with a buffer no smaller than procesesUtilInfo->processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t) ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetProcessesUtilizationInfo(nvmlDevice_t device, nvmlProcessesUtilizationInfo_t *procesesUtilInfo); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlUnitCommands Unit Commands ++ * This chapter describes NVML operations that change the state of the unit. For S-class products. ++ * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION ++ * error code when invoking any of these methods. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Set the LED state for the unit. The LED can be either green (0) or amber (1). ++ * ++ * For S-class products. ++ * Requires root/admin permissions. ++ * ++ * This operation takes effect immediately.
++ * ++ * ++ * Current S-Class products don't provide unique LEDs for each unit. As such, both front ++ * and back LEDs will be toggled in unison regardless of which unit is specified with this command. ++ * ++ * See \ref nvmlLedColor_t for available colors. ++ * ++ * @param unit The identifier of the target unit ++ * @param color The target LED color ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the LED color has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlUnitGetLedState() ++ */ ++nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlDeviceCommands Device Commands ++ * This chapter describes NVML operations that change the state of the device. ++ * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION ++ * error code when invoking any of these methods. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Set the persistence mode for the device. ++ * ++ * For all products. ++ * For Linux only. ++ * Requires root/admin permissions. ++ * ++ * The persistence mode determines whether the GPU driver software is torn down after the last client ++ * exits. ++ * ++ * This operation takes effect immediately. It is not persistent across reboots. After each reboot the ++ * persistence mode is reset to "Disabled". ++ * ++ * See \ref nvmlEnableState_t for available modes. 
++ * ++ * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA ++ * memory, the given device handle will no longer be valid, and to continue to interact with this ++ * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. This ++ * limitation is currently only applicable to devices that have a coherent NVLink connection to ++ * system memory. ++ * ++ * @param device The identifier of the target device ++ * @param mode The target persistence mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the persistence mode was set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetPersistenceMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); ++ ++/** ++ * Set the compute mode for the device. ++ * ++ * For all products. ++ * Requires root/admin permissions. ++ * ++ * The compute mode determines whether a GPU can be used for compute operations and whether it can ++ * be shared across contexts. ++ * ++ * This operation takes effect immediately. Under Linux it is not persistent across reboots and ++ * always resets to "Default". Under windows it is persistent. ++ * ++ * Under windows compute mode may only be set to DEFAULT when running in WDDM ++ * ++ * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. ++ * ++ * See \ref nvmlComputeMode_t for details on available compute modes. 
++ * ++ * @param device The identifier of the target device ++ * @param mode The target compute mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the compute mode was set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetComputeMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); ++ ++/** ++ * Set the ECC mode for the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher. ++ * Requires root/admin permissions. ++ * ++ * The ECC mode determines whether the GPU enables its ECC support. ++ * ++ * This operation takes effect after the next reboot. ++ * ++ * See \ref nvmlEnableState_t for details on available modes. 
++ * ++ * @param device The identifier of the target device ++ * @param ecc The target ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the ECC mode was set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetEccMode() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); ++ ++/** ++ * Clear the ECC error and other memory error counts for the device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Only applicable to devices with ECC. ++ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. ++ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. ++ * Requires root/admin permissions. ++ * Requires ECC Mode to be enabled. ++ * ++ * Sets all of the specified ECC counters to 0, including both detailed and total counts. ++ * ++ * This operation takes effect immediately. ++ * ++ * See \ref nvmlMemoryErrorType_t for details on available counter types. ++ * ++ * @param device The identifier of the target device ++ * @param counterType Flag that indicates which type of errors should be cleared. 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if the error counts were cleared ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see ++ * - nvmlDeviceGetDetailedEccErrors() ++ * - nvmlDeviceGetTotalEccErrors() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); ++ ++/** ++ * Set the driver model for the device. ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * For windows only. ++ * Requires root/admin permissions. ++ * ++ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached ++ * to the device it must run in WDDM mode. ++ * ++ * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). ++ * This should only be done if the host is subsequently powered down and the display is detached from the device ++ * before the next reboot. ++ * ++ * This operation takes effect after the next reboot. ++ * ++ * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. ++ * ++ * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or ++ * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. ++ * ++ * See \ref nvmlDriverModel_t for details on available driver models. 
++ * See \ref nvmlFlagDefault and \ref nvmlFlagForce ++ * ++ * @param device The identifier of the target device ++ * @param driverModel The target driver model ++ * @param flags Flags that change the default behavior ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the driver model has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetDriverModel() ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); ++ ++typedef enum nvmlClockLimitId_enum { ++ NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, ++ NVML_CLOCK_LIMIT_ID_TDP, ++ NVML_CLOCK_LIMIT_ID_UNLIMITED ++} nvmlClockLimitId_t; ++ ++/** ++ * Set clocks that device will lock to. ++ * ++ * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. ++ * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. ++ * See \ref nvmlDeviceSetApplicationsClocks ++ * ++ * Can be used as a setting to request constant performance. ++ * ++ * This can be called with a pair of integer clock frequencies in MHz, or a pair of \ref nvmlClockLimitId_t values. ++ * See the table below for valid combinations of these values.
++ * ++ * minGpuClock | maxGpuClock | Effect ++ * ------------+-------------+-------------------------------------------------- ++ * tdp | tdp | Lock clock to TDP ++ * unlimited | tdp | Upper bound is TDP but clock may drift below this ++ * tdp | unlimited | Lower bound is TDP but clock may boost above this ++ * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) ++ * ++ * If one arg takes one of these values, the other must be one of these values as ++ * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. ++ * ++ * Requires root/admin permissions. ++ * ++ * After system reboot or driver reload applications clocks go back to their default value. ++ * See \ref nvmlDeviceResetGpuLockedClocks. ++ * ++ * For Volta &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param minGpuClockMHz Requested minimum gpu clock in MHz ++ * @param maxGpuClockMHz Requested maximum gpu clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz ++ * is not a valid clock combination ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); ++ ++/** ++ * Resets the gpu clock to the default value ++ * ++ * This is the gpu clock that will be used after system reboot or driver reload. 
++ * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * @see nvmlDeviceSetGpuLockedClocks ++ * ++ * For Volta &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); ++ ++/** ++ * Set memory clocks that device will lock to. ++ * ++ * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz. ++ * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. ++ * See \ref nvmlDeviceSetApplicationsClocks ++ * ++ * Can be used as a setting to request constant performance. ++ * ++ * Requires root/admin permissions. ++ * ++ * After system reboot or driver reload applications clocks go back to their default value. ++ * See \ref nvmlDeviceResetMemoryLockedClocks. ++ * ++ * For Ampere &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param minMemClockMHz Requested minimum memory clock in MHz ++ * @param maxMemClockMHz Requested maximum memory clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minMemClockMHz and \a maxMemClockMHz ++ * is not a valid clock combination ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); ++ ++/** ++ * Resets the memory clock to the default value ++ * ++ * This is the memory clock that will be used after system reboot or driver reload. ++ * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * @see nvmlDeviceSetMemoryLockedClocks ++ * ++ * For Ampere &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); ++ ++/** ++ * Set clocks that applications will lock to. ++ * ++ * Sets the clocks that compute and graphics applications will be running at. ++ * e.g. CUDA driver requests these clocks during context creation which means this property ++ * defines clocks at which CUDA applications will be running unless some overspec event ++ * occurs (e.g. over power, over thermal or external HW brake). ++ * ++ * Can be used as a setting to request constant performance. ++ * ++ * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. ++ * ++ * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call ++ * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting ++ * above the clock value being set. ++ * ++ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. ++ * Requires root/admin permissions. ++ * ++ * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks ++ * for details on how to list available clocks combinations. ++ * ++ * After system reboot or driver reload applications clocks go back to their default value. ++ * See \ref nvmlDeviceResetApplicationsClocks. 
++ * ++ * @param device The identifier of the target device ++ * @param memClockMHz Requested memory clock in MHz ++ * @param graphicsClockMHz Requested graphics clock in MHz ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz ++ * is not a valid clock combination ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); ++ ++/** ++ * Resets the application clock to the default value ++ * ++ * This is the applications clock that will be used after system reboot or driver reload. ++ * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks. ++ * ++ * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, ++ * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above ++ * base clocks as thermal limits allow. ++ * ++ * @see nvmlDeviceGetApplicationsClock ++ * @see nvmlDeviceSetApplicationsClocks ++ * ++ * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if new settings were successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); ++ ++/** ++ * Try to set the current state of Auto Boosted clocks on a device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates ++ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock ++ * rates are desired. ++ * ++ * Non-root users may use this API by default but can be restricted by root from using this API by calling ++ * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. ++ * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. ++ * ++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. ++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost ++ * behavior. 
++ * ++ * @param device The identifier of the target device ++ * @param enabled What state to try to set Auto Boosted clocks of the target device to ++ * ++ * @return ++ * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); ++ ++/** ++ * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will ++ * return to when no compute running processes (e.g. CUDA application which have an active context) are running ++ * ++ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. ++ * Requires root/admin permissions. ++ * ++ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates ++ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock ++ * rates are desired. ++ * ++ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. ++ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost ++ * behavior. ++ * ++ * @param device The identifier of the target device ++ * @param enabled What state to try to set default Auto Boosted clocks of the target device to ++ * @param flags Flags that change the default behavior. Currently Unused. 
++ * ++ * @return ++ * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); ++ ++/** ++ * Sets the speed of the fan control policy to default. ++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * @param device The identifier of the target device ++ * @param fan The index of the fan, starting at zero ++ * ++ * return ++ * NVML_SUCCESS if speed has been adjusted ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if device is invalid ++ * NVML_ERROR_NOT_SUPPORTED if the device does not support this ++ * (doesn't have fans) ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); ++ ++/** ++ * Sets current fan control policy. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Requires privileged user. 
++ * ++ * For all cuda-capable discrete products with fans ++ * ++ * device The identifier of the target \a device ++ * policy The fan control \a policy to set ++ * ++ * return ++ * NVML_SUCCESS if \a policy has been set ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference ++ * a fan that exists. ++ * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell ++ * NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, ++ nvmlFanControlPolicy_t policy); ++ ++/** ++ * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. ++ * ++ * @param device The identifier of the target device ++ * @param thresholdType The type of threshold value to be set ++ * @param temp Reference which hold the value to be set ++ * @return ++ * - \ref NVML_SUCCESS if \a temp has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); ++ ++/** ++ * Set new power limit of this device. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. 
++ * ++ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. ++ * ++ * \note Limit is not persistent across reboots or driver unloads. ++ * Enable persistent mode to prevent driver from unloading when no application is using the device. ++ * ++ * @param device The identifier of the target device ++ * @param limit Power management limit in milliwatts to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a limit has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is out of range ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceGetPowerManagementLimitConstraints ++ * @see nvmlDeviceGetPowerManagementDefaultLimit ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); ++ ++/** ++ * Sets new GOM. See \a nvmlGpuOperationMode_t for details. ++ * ++ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. ++ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. ++ * Not supported on Quadro ® and Tesla &tm; C-class products. ++ * Requires root/admin permissions. ++ * ++ * Changing GOMs requires a reboot. ++ * The reboot requirement might be removed in the future. ++ * ++ * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when ++ * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
++ * ++ * @param device The identifier of the target device ++ * @param mode Target GOM ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlGpuOperationMode_t ++ * @see nvmlDeviceGetGpuOperationMode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); ++ ++/** ++ * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. ++ * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. ++ * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction ++ * to query the current restriction settings. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * @param device The identifier of the target device ++ * @param apiType Target API type for this operation ++ * @param isRestricted The target restriction ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isRestricted has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support ++ * the feature that api restrictions are being set for (E.G.
Enabling/disabling auto ++ * boosted clocks is not supported by the device) ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlRestrictedAPI_t ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); ++ ++/** ++ * Sets the speed of a specified fan. ++ * ++ * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor ++ * the temperature and adjust the fan speed accordingly. ++ * If you set the fan speed too low you can burn your GPU! ++ * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. ++ * ++ * For all cuda-capable discrete products with fans that are Maxwell or Newer. ++ * ++ * device The identifier of the target device ++ * fan The index of the fan, starting at zero ++ * speed The target speed of the fan [0-100] in % of max speed ++ * ++ * return ++ * NVML_SUCCESS if the fan speed has been set ++ * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, ++ * or if the fan index doesn't reference an actual fan. ++ * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. ++ * NVML_ERROR_UNKNOWN if there was an unexpected error. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); ++ ++/** ++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works ++ * on Maxwell onwards GPU architectures. 
++ * ++ * Set the GPCCLK VF offset value ++ * @param[in] device The identifier of the target device ++ * @param[in] offset The GPCCLK VF offset value to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); ++ ++/** ++ * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works ++ * on Maxwell onwards GPU architectures. ++ * ++ * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. ++ * @param[in] device The identifier of the target device ++ * @param[in] offset The MemClk VF offset value to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a offset has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); ++ ++/** ++ * @} ++ */ ++ ++/** @addtogroup nvmlAccountingStats ++ * @{ ++ */ ++ ++/** ++ * Enables or disables per process accounting. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. 
++ * ++ * @note This setting is not persistent and will default to disabled after driver unloads. ++ * Enable persistence mode to be sure the setting doesn't switch off to disabled. ++ * ++ * @note Enabling accounting mode has no negative impact on the GPU performance. ++ * ++ * @note Disabling accounting clears all accounting pids information. ++ * ++ * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. ++ * ++ * See \ref nvmlDeviceGetAccountingMode ++ * See \ref nvmlDeviceGetAccountingStats ++ * See \ref nvmlDeviceClearAccountingPids ++ * ++ * @param device The identifier of the target device ++ * @param mode The target accounting mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the new mode has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); ++ ++/** ++ * Clears accounting information about all processes that have already terminated. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * Requires root/admin permissions. 
++ * ++ * See \ref nvmlDeviceGetAccountingMode ++ * See \ref nvmlDeviceGetAccountingStats ++ * See \ref nvmlDeviceSetAccountingMode ++ * ++ * @param device The identifier of the target device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if accounting information has been cleared ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup NvLink NvLink Methods ++ * This chapter describes methods that NVML can perform on NVLINK enabled devices. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieves the state of the device's NvLink for the link specified ++ * ++ * For Pascal &tm; or newer fully supported devices.
++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that ++ * the link is active and NVML_FEATURE_DISABLED indicates it ++ * is inactive ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a isActive has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); ++ ++/** ++ * Retrieves the version of the device's NvLink for the link specified ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param version Requested NvLink version ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); ++ ++/** ++ * Retrieves the requested capability from the device's NvLink for the link specified ++ * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried ++ * The return value should be treated as a boolean. ++ * ++ * For Pascal &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried ++ * @param capResult A boolean for the queried capability indicating that feature is available ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a capResult has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, ++ nvmlNvLinkCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieves the PCI information for the remote node on a NvLink link ++ * Note: pciSubSystemId is not filled in this function and is indeterminate ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param pci \a nvmlPciInfo_t of the remote node for the specified link ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pci has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ++ ++/** ++ * Retrieves the specified error counter value ++ * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available ++ * ++ * For Pascal &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param counter Specifies the NvLink counter to be queried ++ * @param counterValue Returned counter value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a counterValue has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, ++ nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); ++ ++/** ++ * Resets all error counters to zero ++ * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link whose error counters are to be reset ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the reset is successful ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); ++ ++/** ++ * Deprecated: Setting utilization counter control is no longer supported. ++ * ++ * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. ++ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset ++ * of the counters if the reset parameter is non-zero. 
++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param counter Specifies the counter that should be set (0 or 1). ++ * @param link Specifies the NvLink link to be queried ++ * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set ++ * @param reset Resets the counters on set if non-zero ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the control has been set successfully ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, ++ nvmlNvLinkUtilizationControl_t *control, unsigned int reset); ++ ++/** ++ * Deprecated: Getting utilization counter control is no longer supported. ++ * ++ * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. ++ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param counter Specifies the counter that should be set (0 or 1). 
++ * @param link Specifies the NvLink link to be queried ++ * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the control has been set successfully ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, ++ nvmlNvLinkUtilizationControl_t *control); ++ ++ ++/** ++ * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. ++ * ++ * Retrieve the NVLINK utilization counter based on the current control for a specified counter. ++ * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl ++ * before reading the utilization counters as they have no default state ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param counter Specifies the counter that should be read (0 or 1). 
++ * @param rxcounter Receive counter return value ++ * @param txcounter Transmit counter return value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, ++ unsigned long long *rxcounter, unsigned long long *txcounter); ++ ++/** ++ * Deprecated: Freezing NVLINK utilization counters is no longer supported. ++ * ++ * Freeze the NVLINK utilization counters ++ * Both the receive and transmit counters are operated on by this function ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be queried ++ * @param counter Specifies the counter that should be frozen (0 or 1). 
++ * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters ++ * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters ++ * ++ * @return ++ * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, ++ unsigned int counter, nvmlEnableState_t freeze); ++ ++/** ++ * Deprecated: Resetting NVLINK utilization counters is no longer supported. ++ * ++ * Reset the NVLINK utilization counters ++ * Both the receive and transmit counters are operated on by this function ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param link Specifies the NvLink link to be reset ++ * @param counter Specifies the counter that should be reset (0 or 1) ++ * ++ * @return ++ * - \ref NVML_SUCCESS if counters were successfully reset ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); ++ ++/** ++* Get the NVLink device type of the remote device connected over the given link. 
++* ++* @param device The device handle of the target GPU ++* @param link The NVLink link index on the target GPU ++* @param pNvLinkDeviceType Pointer in which the output remote device type is returned ++* ++* @return ++* - \ref NVML_SUCCESS if \a pNvLinkDeviceType has been set ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_NOT_SUPPORTED if NVLink is not supported ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid, or ++* \a pNvLinkDeviceType is NULL ++* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is ++* otherwise inaccessible ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType); ++ ++/** ++ * Set NvLink Low Power Threshold for device. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target device ++ * @param info Reference to \a nvmlNvLinkPowerThres_t struct ++ * input parameters ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the \a Threshold is successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a Threshold is not within range ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * ++ **/ ++nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t *info); ++ ++/** ++ * Set the global nvlink bandwidth mode ++ * ++ * @param nvlinkBwMode nvlink bandwidth mode ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided ++ * - \ref NVML_ERROR_IN_USE if P2P object exists ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture. 
++ * - \ref NVML_ERROR_NO_PERMISSION if not root user ++ */ ++nvmlReturn_t DECLDIR nvmlSystemSetNvlinkBwMode(unsigned int nvlinkBwMode); ++ ++/** ++ * Get the global nvlink bandwidth mode ++ * ++ * @param nvlinkBwMode reference of nvlink bandwidth mode ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided ++ * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture. ++ * - \ref NVML_ERROR_NO_PERMISSION if not root user ++ */ ++nvmlReturn_t DECLDIR nvmlSystemGetNvlinkBwMode(unsigned int *nvlinkBwMode); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlEvents Event Handling Methods ++ * This chapter describes methods that NVML can perform against each device to register and wait for ++ * some event to occur. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Create an empty set of events. ++ * Event set should be freed by \ref nvmlEventSetFree ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * @param set Reference in which to return the event handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the event set has been created ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventSetFree ++ */ ++nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); ++ ++/** ++ * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t ++ * ++ * For Fermi &tm; or newer fully supported devices. 
++ * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) ++ * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) ++ * ++ * For Linux only. ++ * ++ * \b IMPORTANT: Operations on \a set are not thread safe ++ * ++ * This call starts recording of events on specific device. ++ * All events that occurred before this call are not recorded. ++ * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 ++ * ++ * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. ++ * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes ++ * are registered in that case. ++ * ++ * @param device The identifier of the target device ++ * @param eventTypes Bitmask of \ref nvmlEventType to record ++ * @param set Set to which add new event types ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the event has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventType ++ * @see nvmlDeviceGetSupportedEventTypes ++ * @see nvmlEventSetWait ++ * @see nvmlEventSetFree ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); ++ ++/** ++ * Returns information about events supported on device ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. 
++ * ++ * @param device The identifier of the target device ++ * @param eventTypes Reference in which to return bitmask of supported events ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a eventTypes has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventType ++ * @see nvmlDeviceRegisterEvents ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); ++ ++/** ++ * Waits on events and delivers events ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * If some events are ready to be delivered at the time of the call, function returns immediately. ++ * If there are no events ready to be delivered, function sleeps till event arrives ++ * but not longer than specified timeout. This function in certain conditions can return before ++ * specified timeout passes (e.g. when interrupt arrives) ++ * ++ * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. ++ * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error ++ * type is returned for all xid error events. ++ * ++ * On Linux, every xid error event would return the associated event data and other information if applicable. ++ * ++ * In MIG mode, if device handle is provided, the API reports all the events for the available instances, ++ * only if the caller has appropriate privileges. In absence of required privileges, only the events which ++ * affect all the instances (i.e. whole device) are reported. ++ * ++ * This API does not currently support per-instance event reporting using MIG device handles. 
++ * ++ * @param set Reference to set of events to wait on ++ * @param data Reference in which to return event data ++ * @param timeoutms Maximum amount of wait time in milliseconds for registered event ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the data has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL ++ * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived ++ * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlEventType ++ * @see nvmlDeviceRegisterEvents ++ */ ++nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); ++ ++/** ++ * Releases events in the set ++ * ++ * For Fermi &tm; or newer fully supported devices. ++ * ++ * @param set Reference to events to be released ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the event has been successfully released ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlDeviceRegisterEvents ++ */ ++nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlZPI Drain states ++ * This chapter describes methods that NVML can perform against each device to control their drain state ++ * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to ++ * power on/off GPUs, enable robust reset scenarios, etc. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Modify the drain state of a GPU. 
This method forces a GPU to no longer accept new incoming requests. ++ * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before ++ * this call is made. ++ * Must be called as administrator. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported. ++ * ++ * @param pciInfo The PCI address of the GPU whose drain state is to be modified ++ * @param newState The drain state that should be entered, see \ref nvmlEnableState_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the drain state was successfully modified ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a newState is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation ++ * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); ++ ++/** ++ * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining ++ * state. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported. 
++ * ++ * @param pciInfo The PCI address of the GPU whose drain state is to be queried ++ * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a currentState has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a currentState is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); ++ ++/** ++ * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver ++ * as long as no other processes are attached. If other processes are attached, this call will return ++ * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the ++ * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called ++ * to initiate the draining state is if that process was using, and is still using, a GPU before the ++ * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled ++ * prior to this call. ++ * ++ * For long-running NVML processes please note that this will change the enumeration of current GPUs. ++ * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. ++ * Also, device handles after the removed GPU will not be valid and must be re-established. ++ * Must be run as administrator. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported. 
++ * ++ * @param pciInfo The PCI address of the GPU to be removed ++ * @param gpuState Whether the GPU is to be removed from the OS, ++ * see \ref nvmlDetachGpuState_t ++ * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the GPU was successfully removed ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature ++ * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); ++ ++/** ++ * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that ++ * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. ++ * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes ++ * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. ++ * ++ * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds ++ * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. ++ * ++ * Must be run as administrator. ++ * For Linux only. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * Some Kepler devices supported. ++ * ++ * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device ++ * fields are used in this call. 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if the rediscovery completed successfully ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature ++ * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature ++ * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlFieldValueQueries Field Value Queries ++ * This chapter describes NVML operations that are associated with retrieving Field Values from NVML ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. ++ * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs ++ * will be populated from a single call rather than making a driver call for each fieldId. ++ * ++ * @param device The device handle of the GPU to request field values for ++ * @param valuesCount Number of entries in values that should be retrieved ++ * @param values Array of \a valuesCount structures to hold field values. ++ * Each value's fieldId must be populated prior to this call ++ * ++ * @return ++ * - \ref NVML_SUCCESS if any values in \a values were populated. 
Note that you must ++ * check the nvmlReturn field of each value for each individual ++ * status ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ++ ++/** ++ * Clear values for a list of fields for a device. This API allows multiple fields to be cleared at once. ++ * ++ * @param device The device handle of the GPU to request field values for ++ * @param valuesCount Number of entries in values that should be cleared ++ * @param values Array of \a valuesCount structures to hold field values. ++ * Each value's fieldId must be populated prior to this call ++ * ++ * @return ++ * - \ref NVML_SUCCESS if any values in \a values were cleared. Note that you must ++ * check the nvmlReturn field of each value for each individual ++ * status ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlVirtualGpuQueries vGPU APIs ++ * This chapter describes operations that are associated with NVIDIA vGPU Software products. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * This method is used to get the virtualization mode corresponding to the GPU. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device Identifier of the target device ++ * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a pVirtualMode is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); ++ ++/** ++ * Queries if SR-IOV host operation is supported on a vGPU supported device. ++ * ++ * Checks whether SR-IOV host capability is supported by the device and the ++ * driver, and indicates device is in SR-IOV mode if both of these conditions ++ * are true. ++ * ++ * @param device The identifier of the target device ++ * @param pHostVgpuMode Reference in which to return the current vGPU mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pHostVgpuMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. ++ * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); ++ ++/** ++ * This method is used to set the virtualization mode corresponding to the GPU. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param device Identifier of the target device ++ * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a virtualMode is set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a virtualMode is invalid ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. ++ * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); ++ ++/** ++ * Get the vGPU heterogeneous mode for the device. ++ * ++ * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. ++ * ++ * On successful return, the function returns \a pHeterogeneousMode->mode with the current vGPU heterogeneous mode. ++ * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should ++ * set the correct version number to retrieve the vGPU heterogeneous mode. ++ * \a pHeterogeneousMode->mode can either be \ref NVML_FEATURE_ENABLED or \ref NVML_FEATURE_DISABLED. 
++ * ++ * @param device The identifier of the target device ++ * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pHeterogeneousMode is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support this feature ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuHeterogeneousMode(nvmlDevice_t device, nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); ++ ++/** ++ * Enable or disable vGPU heterogeneous mode for the device. ++ * ++ * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. ++ * ++ * API would return an appropriate error code upon unsuccessful activation. For example, the heterogeneous mode ++ * set will fail with error \ref NVML_ERROR_IN_USE if any vGPU instance is active on the device. The caller of this API ++ * is expected to shutdown the vGPU VMs and retry setting the \a mode. ++ * On successful return, the function updates the vGPU heterogeneous mode with the user provided \a pHeterogeneousMode->mode. ++ * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should ++ * set the correct version number to set the vGPU heterogeneous mode. 
++ * ++ * @param device Identifier of the target device ++ * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a pHeterogeneousMode is NULL or \a pHeterogeneousMode->mode is invalid ++ * - \ref NVML_ERROR_IN_USE If the \a device is in use ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or \a device doesn't support this feature ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuHeterogeneousMode(nvmlDevice_t device, const nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); ++ ++/** ++ * Query the placement ID of active vGPU instance. ++ * ++ * When in vGPU heterogeneous mode, this function returns a valid placement ID as \a pPlacement->placementId ++ * else NVML_INVALID_VGPU_PLACEMENT_ID is returned. ++ * \a pPlacement->version is the version number of the structure nvmlVgpuPlacementId_t, the caller should ++ * set the correct version number to get placement id of the vGPU instance \a vgpuInstance. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param pPlacement Pointer to vGPU placement ID structure \a nvmlVgpuPlacementId_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS If information is successfully retrieved ++ * - \ref NVML_ERROR_NOT_FOUND If \a vgpuInstance does not match a valid active vGPU instance ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuInstance is invalid or \a pPlacement is NULL ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacement is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetPlacementId(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuPlacementId_t *pPlacement); ++ ++/** ++ * Query the supported vGPU placement ID of the vGPU type. ++ * ++ * An array of supported vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the ++ * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be ++ * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). ++ * ++ * This function will return supported placement IDs even if GPU is not in vGPU heterogeneous mode. ++ * ++ * @param device Identifier of the target device ++ * @param vgpuTypeId Handle to vGPU type. 
The vGPU type ID ++ * @param pPlacementList Pointer to the vGPU placement structure \a nvmlVgpuPlacementList_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeSupportedPlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); ++ ++/** ++ * Query the creatable vGPU placement ID of the vGPU type. ++ * ++ * An array of creatable vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the ++ * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be ++ * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). ++ * The creatable vGPU placement IDs may differ over time, as there may be restrictions on what type of vGPU the ++ * vGPU instance is running. ++ * ++ * The function will return \ref NVML_ERROR_NOT_SUPPORTED if the \a device is not in vGPU heterogeneous mode. ++ * ++ * @param device The identifier of the target device ++ * @param vgpuTypeId Handle to vGPU type. 
The vGPU type ID ++ * @param pPlacementList Pointer to the list of vGPU placement structure \a nvmlVgpuPlacementList_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeCreatablePlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); ++ ++/** ++ * Retrieve the static GSP heap size of the vGPU type in bytes ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param gspHeapSize Reference to return the GSP heap size value ++ * @return ++ * - \ref NVML_SUCCESS Successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a gspHeapSize is NULL ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetGspHeapSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *gspHeapSize); ++ ++/** ++ * Retrieve the static framebuffer reservation of the vGPU type in bytes ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param fbReservation Reference to return the framebuffer reservation ++ * @return ++ * - \ref NVML_SUCCESS Successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a fbReservation is NULL ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++ */ ++nvmlReturn_t 
DECLDIR nvmlVgpuTypeGetFbReservation(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbReservation); ++ ++/** ++ * Set the desirable vGPU capability of a device ++ * ++ * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be set. ++ * See \ref nvmlEnableState_t for available state. ++ * ++ * @param device The identifier of the target device ++ * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be set ++ * @param state The target capability mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS Successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a capability is invalid, or \a state is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state, or \a device not in vGPU mode ++ * - \ref NVML_ERROR_UNKNOWN On any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, nvmlEnableState_t state); ++ ++/** ++ * Retrieve the vGPU Software licensable features. ++ * ++ * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s) ++ * and their current license status. 
++ * ++ * @param device Identifier of the target device ++ * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned ++ * ++ * @return ++ * - \ref NVML_SUCCESS if licensable features are successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlVgpu vGPU Management ++ * @{ ++ * ++ * This chapter describes APIs supporting NVIDIA vGPU. ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieve the requested vGPU driver capability. ++ * ++ * Refer to the \a nvmlVgpuDriverCapability_t structure for the specific capabilities that can be queried. ++ * The return value in \a capResult should be treated as a boolean, with a non-zero value indicating that the capability ++ * is supported. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param capability Specifies the \a nvmlVgpuDriverCapability_t to be queried ++ * @param capResult A boolean for the queried capability indicating that feature is supported ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capability is invalid, or \a capResult is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or \a devices not in vGPU mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieve the requested vGPU capability for GPU. ++ * ++ * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be queried. ++ * The return value in \a capResult reports a non-zero value indicating that the capability ++ * is supported, and also reports the capability's data based on the queried capability. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be queried ++ * @param capResult Specifies that the queried capability is supported, and also returns capability's data ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a capability is invalid, or \a capResult is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or \a device not in vGPU mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieve the supported vGPU types on a physical GPU (device). ++ * ++ * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer ++ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount ++ * is used to return the number of vGPU types written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. ++ * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. ++ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. 
++ * ++ * @param device The identifier of the target device ++ * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types ++ * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); ++ ++/** ++ * Retrieve the currently creatable vGPU types on a physical GPU (device). ++ * ++ * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer ++ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount ++ * is used to return the number of vGPU types written to the buffer. ++ * ++ * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types ++ * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable ++ * list will be restricted to whatever vGPU type is already running on the device. ++ * ++ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. ++ * To query the number of vGPU types that can be created for the GPU, call this function with *vgpuCount = 0. ++ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. 
++ * ++ * @param device The identifier of the target device ++ * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types ++ * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); ++ ++/** ++ * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuTypeClass Pointer to string array to return class in ++ * @param size Size of string ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); ++ ++/** ++ * Retrieve the vGPU type name. ++ * ++ * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not ++ * exceed 64 characters in length (including the NUL terminator). See \ref ++ * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuTypeName Pointer to buffer to return name ++ * @param size Size of buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeName is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); ++ ++/** ++ * Retrieve the GPU Instance Profile ID for the given vGPU type ID. ++ * The API will return a valid GPU Instance Profile ID for the MIG capable vGPU types, else INVALID_GPU_INSTANCE_PROFILE_ID is ++ * returned. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param gpuInstanceProfileId GPU Instance Profile ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_NOT_SUPPORTED if \a device is not in vGPU Host virtualization mode ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a gpuInstanceProfileId is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *gpuInstanceProfileId); ++ ++/** ++ * Retrieve the device ID of a vGPU type. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value ++ * @param subsystemID Subsystem ID and subsystem vendor ID of the device contained in single 32 bit value ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceID or \a subsystemID is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); ++ ++/** ++ * Retrieve the vGPU framebuffer size in bytes. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param fbSize Pointer to framebuffer size in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); ++ ++/** ++ * Retrieve count of vGPU's supported display heads. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param numDisplayHeads Pointer to number of display heads ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); ++ ++/** ++ * Retrieve vGPU display head's maximum supported resolution. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param displayIndex Zero-based index of display head ++ * @param xdim Pointer to maximum number of pixels in X dimension ++ * @param ydim Pointer to maximum number of pixels in Y dimension ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex ++ * is out of range. ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); ++ ++/** ++ * Retrieve license requirements for a vGPU type ++ * ++ * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form ++ * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, ++ * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". ++ * ++ * The total length of the returned string will not exceed 128 characters, including the NUL terminator. 
++ * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuTypeLicenseString Pointer to buffer to return license info ++ * @param size Size of \a vgpuTypeLicenseString buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); ++ ++/** ++ * Retrieve the static frame rate limit value of the vGPU type ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param frameRateLimit Reference to return the frame rate limit value ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); ++ ++/** ++ * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuInstanceCount Pointer to get the max number of vGPU instances ++ * that can be created on a device for given vgpuTypeId ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, ++ * or \a vgpuInstanceCount is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); ++ ++/** ++ * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param vgpuInstanceCountPerVm Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm); ++ ++/** ++ * Retrieve the BAR1 info for given vGPU type. ++ * ++ * For Maxwell &tm; or newer fully supported devices.
++ * ++ * @param vgpuTypeId Handle to vGPU type ++ * @param bar1Info Pointer to the vGPU type BAR1 information structure \a nvmlVgpuTypeBar1Info_t ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a bar1Info is NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetBAR1Info(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuTypeBar1Info_t *bar1Info); ++ ++/** ++ * Retrieve the active vGPU instances on a device. ++ * ++ * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The ++ * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances ++ * written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. ++ * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return ++ * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param device The identifier of the target device ++ * @param vgpuCount Pointer which passes in the array size and returns ++ * the number of vGPU instances ++ * @param vgpuInstances Pointer to array in which to return list of vGPU instances ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuCount is too small ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); ++ ++/** ++ * Retrieve the VM ID associated with a vGPU instance. ++ * ++ * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. ++ * ++ * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param vmId Pointer to caller-supplied buffer to hold VM ID ++ * @param size Size of buffer in bytes ++ * @param vmIdType Pointer to hold VM ID type ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); ++ ++/** ++ * Retrieve the UUID of a vGPU instance. ++ * ++ * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, ++ * not exceeding 80 characters in length (including the NUL terminator). ++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID ++ * @param size Size of buffer in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); ++ ++/** ++ * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. ++ * ++ * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version ++ * string will not exceed 80 characters in length (including the NUL terminator). ++ * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. ++ * ++ * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is ++ * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the ++ * NVIDIA driver is loaded and initialized. ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param version Caller-supplied buffer to return driver version string ++ * @param length Size of \a version buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a version has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); ++ ++/** ++ * Retrieve the framebuffer usage in bytes. ++ * ++ * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance The identifier of the target instance ++ * @param fbUsage Pointer to framebuffer usage in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); ++ ++/** ++ * @deprecated Use \ref nvmlVgpuInstanceGetLicenseInfo_v2. ++ * ++ * Retrieve the current licensing state of the vGPU instance. ++ * ++ * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. ++ * ++ * For Kepler &tm; or newer fully supported devices.
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param licensed Reference to return the licensing status ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a licensed has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); ++ ++/** ++ * Retrieve the vGPU type of a vGPU instance. ++ * ++ * Returns the vGPU type ID of vgpu assigned to the vGPU instance. ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param vgpuTypeId Reference to return the vgpuTypeId ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a vgpuTypeId has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); ++ ++/** ++ * Retrieve the frame rate limit set for the vGPU instance. ++ * ++ * Returns the value of the frame rate limit set for the vGPU instance ++ * ++ * For Kepler &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param frameRateLimit Reference to return the frame rate limit ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a frameRateLimit has been set ++ * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); ++ ++/** ++ * Retrieve the current ECC mode of vGPU instance. ++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param eccMode Reference in which to return the current ECC mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a eccMode is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); ++ ++/** ++ * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. ++ * ++ * For Maxwell &tm; or newer fully supported devices.
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param encoderCapacity Reference to an unsigned int for the encoder capacity ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); ++ ++/** ++ * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param encoderCapacity Unsigned int for the encoder capacity value ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a encoderCapacity has been set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); ++ ++/** ++ * Retrieves the current encoder statistics of a vGPU Instance ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param sessionCount Reference to an unsigned int for count of active encoder sessions ++ * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions ++ * @param averageLatency Reference to an unsigned int for encode latency in microseconds ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL ++ * or \a vgpuInstance is 0. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, ++ unsigned int *averageFps, unsigned int *averageLatency); ++ ++/** ++ * Retrieves information about all active encoder sessions on a vGPU Instance. ++ * ++ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The ++ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++ * written to the buffer. ++ * ++ * If the supplied buffer is not large enough to accommodate the active session array, the function returns ++ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. ++ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return ++ * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param sessionCount Reference to caller supplied array size, and returns ++ * the number of sessions. ++ * @param sessionInfo Reference to caller supplied array in which the list ++ * of session information is returned. ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a sessionInfo is fetched ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is ++ returned in \a sessionCount ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL, or \a vgpuInstance is 0. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); ++ ++/** ++* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats ++* ++* @return ++* - \ref NVML_SUCCESS if \a fbcStats is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbcStats is NULL ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats); ++ ++/** ++* Retrieves information about active frame buffer capture sessions on a vGPU Instance.
++* ++* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The ++* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions ++* written to the buffer. ++* ++* If the supplied buffer is not large enough to accommodate the active session array, the function returns ++* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. ++* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return ++* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. ++* ++* For Maxwell &tm; or newer fully supported devices. ++* ++* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may ++* be zero if there are no new frames captured since the session started. ++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. ++* @param sessionInfo Reference in which to return the session information ++* ++* @return ++* - \ref NVML_SUCCESS if \a sessionInfo is fetched ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a sessionCount is NULL. ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); ++ ++/** ++* Retrieve the GPU Instance ID for the given vGPU Instance. 
++* The API will return a valid GPU Instance ID for MIG backed vGPU Instance, else INVALID_GPU_INSTANCE_ID is returned. ++* ++* For Kepler &tm; or newer fully supported devices. ++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param gpuInstanceId GPU Instance ID ++* ++* @return ++* - \ref NVML_SUCCESS successful completion ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a gpuInstanceId is NULL. ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, unsigned int *gpuInstanceId); ++ ++/** ++* Retrieves the PCI Id of the given vGPU Instance i.e. the PCI Id of the GPU as seen inside the VM. ++* ++* The vGPU PCI id is returned as "00000000:00:00.0" if NVIDIA driver is not installed on the vGPU instance. 
++* ++* @param vgpuInstance Identifier of the target vGPU instance ++* @param vgpuPciId Caller-supplied buffer to return vGPU PCI Id string ++* @param length Size of the vgpuPciId buffer ++* ++* @return ++* - \ref NVML_SUCCESS if vGPU PCI Id is successfully retrieved ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuPciId is NULL ++* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++* - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance ++* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small, \a length is set to required length ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char *vgpuPciId, unsigned int *length); ++ ++/** ++* Retrieve the requested capability for a given vGPU type. Refer to the \a nvmlVgpuCapability_t structure ++* for the specific capabilities that can be queried. The return value in \a capResult should be treated as ++* a boolean, with a non-zero value indicating that the capability is supported. ++* ++* For Maxwell &tm; or newer fully supported devices.
++* ++* @param vgpuTypeId Handle to vGPU type ++* @param capability Specifies the \a nvmlVgpuCapability_t to be queried ++* @param capResult A boolean for the queried capability indicating that feature is supported ++* ++* @return ++* - \ref NVML_SUCCESS successful completion ++* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a capability is invalid, or \a capResult is NULL ++* - \ref NVML_ERROR_UNKNOWN on any unexpected error ++*/ ++nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult); ++ ++/** ++ * Retrieve the MDEV UUID of a vGPU instance. ++ * ++ * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, ++ * not exceeding 80 characters in length (including the NULL terminator). ++ * MDEV UUID is displayed only on KVM platform. ++ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID ++ * @param size Size of buffer in bytes ++ * ++ * @return ++ * - \ref NVML_SUCCESS successful completion ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvml vGPU Migration ++ * This chapter describes operations that are associated with vGPU Migration. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Structure representing range of vGPU versions. ++ */ ++typedef struct nvmlVgpuVersion_st ++{ ++ unsigned int minVersion; //!< Minimum vGPU version. ++ unsigned int maxVersion; //!< Maximum vGPU version. ++} nvmlVgpuVersion_t; ++ ++/** ++ * vGPU metadata structure. 
++ */ ++typedef struct nvmlVgpuMetadata_st ++{ ++ unsigned int version; //!< Current version of the structure ++ unsigned int revision; //!< Current revision of the structure ++ nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields ++ char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest ++ char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host ++ unsigned int reserved[6]; //!< Reserved for internal use ++ unsigned int vgpuVirtualizationCaps; //!< vGPU virtualization capabilities bitfield ++ unsigned int guestVgpuVersion; //!< vGPU version of guest driver ++ unsigned int opaqueDataSize; //!< Size of opaque data field in bytes ++ char opaqueData[4]; //!< Opaque data ++} nvmlVgpuMetadata_t; ++ ++/** ++ * Physical GPU metadata structure ++ */ ++typedef struct nvmlVgpuPgpuMetadata_st ++{ ++ unsigned int version; //!< Current version of the structure ++ unsigned int revision; //!< Current revision of the structure ++ char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version ++ unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualization capabilities bitfield ++ unsigned int reserved[5]; //!< Reserved for internal use ++ nvmlVgpuVersion_t hostSupportedVgpuRange; //!< vGPU version range supported by host driver ++ unsigned int opaqueDataSize; //!< Size of opaque data field in bytes ++ char opaqueData[4]; //!< Opaque data ++} nvmlVgpuPgpuMetadata_t; ++ ++/** ++ * vGPU VM compatibility codes ++ */ ++typedef enum nvmlVgpuVmCompatibility_enum ++{ ++ NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable ++ NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) ++ NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) ++ NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) 
++ NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8 //!< vGPU is runnable from a live/paused (ACPI S0) ++} nvmlVgpuVmCompatibility_t; ++ ++/** ++ * vGPU-pGPU compatibility limit codes ++ */ ++typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum ++{ ++ NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited. ++ NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version. ++ NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. ++ NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. ++ NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000 //!< Compatibility is limited by an undefined factor. ++} nvmlVgpuPgpuCompatibilityLimitCode_t; ++ ++/** ++ * vGPU-pGPU compatibility structure ++ */ ++typedef struct nvmlVgpuPgpuCompatibility_st ++{ ++ nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t ++ nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t ++} nvmlVgpuPgpuCompatibility_t; ++ ++/** ++ * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM ++ * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section ++ * containing internal state. ++ * ++ * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are ++ * dependent on information obtained from the guest VM, which may not yet have reached a state where that information ++ * is available. The current state of these dependent fields is reflected in the info structure's \ref nvmlVgpuGuestInfoState_t field.
++ * ++ * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide ++ * it to Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. ++ * ++ * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure ++ * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed ++ * in \a bufferSize. ++ * ++ * @param vgpuInstance vGPU instance handle ++ * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written ++ * @param bufferSize Size of vgpuMetadata buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); ++ ++/** ++ * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about ++ * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section ++ * containing internal state. ++ * ++ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata ++ * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed ++ * in \a bufferSize. 
++ * ++ * @param device The identifier of the target device ++ * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written ++ * @param bufferSize Pointer to size of \a pgpuMetadata buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS GPU metadata structure was successfully returned ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. ++ * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); ++ ++/** ++ * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a ++ * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the ++ * physical GPU. ++ * ++ * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The ++ * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility ++ * with the physical GPU is limited, a limit code indicates the factor limiting compatibility. ++ * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). ++ * ++ * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to ++ * boot a given vGPU or associated VM.
++ * ++ * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure ++ * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure ++ * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info ++ * ++ * @return ++ * - \ref NVML_SUCCESS compatibility information was successfully returned ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a compatibilityInfo are NULL ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); ++ ++/** ++ * Returns the properties of the physical GPU indicated by the device in an ascii-encoded string format. ++ * ++ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the ++ * string is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed ++ * in \a bufferSize. ++ * ++ * @param device The identifier of the target device ++ * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written ++ * @param bufferSize Pointer to size of \a pgpuMetadata buffer ++ * ++ * @return ++ * - \ref NVML_SUCCESS GPU metadata structure was successfully returned ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a pgpuMetadata buffer is too small, required size is returned in \a bufferSize ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the system ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); ++ ++/** ++ * Returns the vGPU Software scheduler logs.
++ * \a pSchedulerLog points to a caller-allocated structure to contain the logs. The number of elements returned will ++ * never exceed \a NVML_SCHEDULER_SW_MAX_LOG_ENTRIES. ++ * ++ * To get the entire logs, call the function at least 5 times a second. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target \a device ++ * @param pSchedulerLog Reference in which \a pSchedulerLog is written ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler logs were successfully obtained ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerLog is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedulerLog_t *pSchedulerLog); ++ ++/** ++ * Returns the vGPU scheduler state. ++ * The information returned in \a nvmlVgpuSchedulerGetState_t is not relevant if the BEST EFFORT policy is set. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target \a device ++ * @param pSchedulerState Reference in which \a pSchedulerState is returned ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler state is successfully obtained ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t *pSchedulerState); ++ ++/** ++ * Returns the vGPU scheduler capabilities. ++ * The list of supported vGPU schedulers returned in \a nvmlVgpuSchedulerCapabilities_t is from ++ * the NVML_VGPU_SCHEDULER_POLICY_*.
This list enumerates the supported scheduler policies ++ * if the engine is Graphics type. ++ * The other values in \a nvmlVgpuSchedulerCapabilities_t are also applicable if the engine is ++ * Graphics type. For other engine types, it is BEST EFFORT policy. ++ * If ARR is supported and enabled, scheduling frequency and averaging factor are applicable ++ * else timeSlice is applicable. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * @param device The identifier of the target \a device ++ * @param pCapabilities Reference in which \a pCapabilities is written ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler capabilities were successfully obtained ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pCapabilities is NULL or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t *pCapabilities); ++ ++/** ++ * Sets the vGPU scheduler state. ++ * ++ * For Pascal &tm; or newer fully supported devices. ++ * ++ * The scheduler state change won't persist across module load/unload. ++ * Scheduler state and params will be allowed to set only when no VM is running. ++ * In \a nvmlVgpuSchedulerSetState_t, IFF enableARRMode is enabled then ++ * provide avgFactorForARR and frequency as input. If enableARRMode is disabled ++ * then provide timeslice as input. ++ * ++ * @param device The identifier of the target \a device ++ * @param pSchedulerState vGPU \a pSchedulerState to set ++ * ++ * @return ++ * - \ref NVML_SUCCESS vGPU scheduler state has been successfully set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid ++ * - \ref NVML_ERROR_RESET_REQUIRED if setting \a pSchedulerState failed with fatal error, ++ * reboot is required to overcome from this error. 
++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode ++ * or if any vGPU instance currently exists on the \a device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerSetState_t *pSchedulerState); ++ ++/* ++ * Virtual GPU (vGPU) version ++ * ++ * The NVIDIA vGPU Manager and the guest drivers are tagged with a range of supported vGPU versions. This determines the range of NVIDIA guest driver versions that ++ * are compatible for vGPU feature support with a given NVIDIA vGPU Manager. For vGPU feature support, the range of supported versions for the NVIDIA vGPU Manager ++ * and the guest driver must overlap. Otherwise, the guest driver fails to load in the VM. ++ * ++ * When the NVIDIA guest driver loads, either when the VM is booted or when the driver is installed or upgraded, a negotiation occurs between the guest driver ++ * and the NVIDIA vGPU Manager to select the highest mutually compatible vGPU version. The negotiated vGPU version stays the same across VM migration. ++ */ ++ ++/** ++ * Query the ranges of supported vGPU versions. ++ * ++ * This function gets the linear range of supported vGPU versions that is preset for the NVIDIA vGPU Manager and the range set by an administrator. ++ * If the preset range has not been overridden by \ref nvmlSetVgpuVersion, both ranges are the same. ++ * ++ * The caller passes pointers to the following \ref nvmlVgpuVersion_t structures, into which the NVIDIA vGPU Manager writes the ranges: ++ * 1. \a supported structure that represents the preset range of vGPU versions supported by the NVIDIA vGPU Manager. ++ * 2. \a current structure that represents the range of supported vGPU versions set by an administrator. By default, this range is the same as the preset range. 
++ * ++ * @param supported Pointer to the structure in which the preset range of vGPU versions supported by the NVIDIA vGPU Manager is written ++ * @param current Pointer to the structure in which the range of supported vGPU versions set by an administrator is written ++ * ++ * @return ++ * - \ref NVML_SUCCESS The vGPU version range structures were successfully obtained. ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. ++ * - \ref NVML_ERROR_INVALID_ARGUMENT The \a supported parameter or the \a current parameter is NULL. ++ * - \ref NVML_ERROR_UNKNOWN An error occurred while the data was being fetched. ++ */ ++nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVersion_t *current); ++ ++/** ++ * Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator. ++ * ++ * This function configures the NVIDIA vGPU Manager with a range of supported vGPU versions set by an administrator. This range must be a subset of the ++ * preset range that the NVIDIA vGPU Manager supports. The custom range set by an administrator takes precedence over the preset range and is advertised to ++ * the guest VM for negotiating the vGPU version. See \ref nvmlGetVgpuVersion for details of how to query the preset range of versions supported. ++ * ++ * This function takes a pointer to vGPU version range structure \ref nvmlVgpuVersion_t as input to override the preset vGPU version range that the NVIDIA vGPU Manager supports. ++ * ++ * After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager. ++ * ++ * @note 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports. Otherwise, an error is returned. ++ * 2. If the range of supported guest driver versions does not overlap the range set by the administrator, the guest driver fails to load. ++ * 3. 
If the range of supported guest driver versions overlaps the range set by the administrator, the guest driver will load with a negotiated ++ * vGPU version that is the maximum value in the overlapping range. ++ * 4. No VMs must be running on the host when this function is called. If a VM is running on the host, the call to this function fails. ++ * ++ * @param vgpuVersion Pointer to a caller-supplied range of supported vGPU versions. ++ * ++ * @return ++ * - \ref NVML_SUCCESS The preset range of supported vGPU versions was successfully overridden. ++ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. ++ * - \ref NVML_ERROR_IN_USE The range was not overridden because a VM is running on the host. ++ * - \ref NVML_ERROR_INVALID_ARGUMENT The \a vgpuVersion parameter specifies a range that is outside the range supported by the NVIDIA vGPU Manager or if \a vgpuVersion is NULL. ++ */ ++nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlUtil vGPU Utilization and Accounting ++ * This chapter describes operations that are associated with vGPU Utilization and Accounting. ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Retrieves current utilization for vGPUs on a physical GPU (device). ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running ++ * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer ++ * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the ++ * CPU timestamp at which the samples were recorded. 
Individual utilization values are returned as "unsigned int" values ++ * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to ++ * indicate the returned value type. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance ++ * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate ++ * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with ++ * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the ++ * buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuInstanceSamplesCount with the number of vGPU utilization sample ++ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or ++ * destroyed. ++ * ++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * @param device The identifier for the target device ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
++ * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values ++ * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances ++ * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is ++ * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all ++ * vGPU instances currently executing on the device ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, ++ nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, ++ nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); ++ ++/** ++ * Retrieves recent utilization for vGPU instances running on a physical GPU (device). ++ * ++ * For Kepler &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for vGPU ++ * instances running on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied ++ * buffer pointed at by \a vgpuUtilInfo->vgpuUtilArray. 
One utilization sample structure is returned per vGPU instance, and includes the ++ * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values ++ * in nvmlValue_t unions. The function sets the caller-supplied \a vgpuUtilInfo->sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to ++ * indicate the returned value type. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a vgpuUtilInfo->vgpuUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance ++ * count in \a vgpuUtilInfo->vgpuInstanceCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate ++ * a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t). Invoke the function again with ++ * the allocated buffer passed in \a vgpuUtilInfo->vgpuUtilArray, and \a vgpuUtilInfo->vgpuInstanceCount set to the number of entries the ++ * buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuUtilInfo->vgpuInstanceCount with the number of vGPU utilization sample ++ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or ++ * destroyed. ++ * ++ * \a vgpuUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a vgpuUtilInfo->lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. 
++ * ++ * @param device The identifier for the target device ++ * @param vgpuUtilInfo Pointer to the caller-provided structure of nvmlVgpuInstancesUtilizationInfo_t ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuUtilInfo is NULL, or \a vgpuUtilInfo->vgpuInstanceCount is 0 ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a vgpuUtilInfo is invalid ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuUtilInfo->vgpuUtilArray is NULL, or the buffer size of vgpuUtilInfo->vgpuInstanceCount is too small. ++ * The caller should check the current vGPU instance count from the returned vgpuUtilInfo->vgpuInstanceCount, and call ++ * the function again with a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t) ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuInstancesUtilizationInfo(nvmlDevice_t device, ++ nvmlVgpuInstancesUtilizationInfo_t *vgpuUtilInfo); ++ ++/** ++ * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on ++ * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the ++ * caller-supplied buffer pointed at by \a utilizationSamples. 
One utilization sample structure is returned per process running ++ * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which ++ * the samples were recorded. Individual utilization values are returned as "unsigned int" values. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance ++ * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size ++ * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with ++ * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the ++ * buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuProcessSamplesCount with the number of vGPU sub process utilization sample ++ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active ++ * in any given sample period. ++ * ++ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. ++ * ++ * @param device The identifier for the target device ++ * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
++ * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances ++ * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount is NULL, or a sample count of 0 is ++ * passed with a non-NULL \a utilizationSamples ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all ++ * vGPU instances currently executing on the device ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, ++ unsigned int *vgpuProcessSamplesCount, ++ nvmlVgpuProcessUtilizationSample_t *utilizationSamples); ++ ++/** ++ * Retrieves recent utilization for processes running on vGPU instances on a physical GPU (device). ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for processes running ++ * on vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied ++ * buffer pointed at by \a vgpuProcUtilInfo->vgpuProcUtilArray. One utilization sample structure is returned per process running ++ * on vGPU instances, that had some non-zero utilization during the last sample period.
It includes the CPU timestamp at which ++ * the samples were recorded. Individual utilization values are returned as "unsigned int" values. ++ * ++ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with ++ * \a vgpuProcUtilInfo->vgpuProcUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current processes' count ++ * running on vGPU instances in \a vgpuProcUtilInfo->vgpuProcessCount. The caller should allocate a buffer of size ++ * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed ++ * in \a vgpuProcUtilInfo->vgpuProcUtilArray, and \a vgpuProcUtilInfo->vgpuProcessCount set to the number of entries the buffer is sized for. ++ * ++ * On successful return, the function updates \a vgpuProcUtilInfo->vgpuProcessCount with the number of vGPU sub process utilization sample ++ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active ++ * in any given sample period. ++ * ++ * vgpuProcUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 ++ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set vgpuProcUtilInfo->lastSeenTimeStamp ++ * to a timeStamp retrieved from a previous query to read utilization since the previous query. 
++ * ++ * @param device The identifier for the target device ++ * @param vgpuProcUtilInfo Pointer to the caller-provided structure of nvmlVgpuProcessesUtilizationInfo_t ++ ++ * @return ++ * - \ref NVML_SUCCESS if utilization samples are successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuProcUtilInfo is null ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version of \a vgpuProcUtilInfo is invalid ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuProcUtilInfo->vgpuProcUtilArray is null, or supplied \a vgpuProcUtilInfo->vgpuProcessCount ++ * is too small to return samples for all processes on vGPU instances currently executing on the device. ++ * The caller should check the current processes count from the returned \a vgpuProcUtilInfo->vgpuProcessCount, ++ * and call the function again with a buffer of size ++ * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t) ++ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessesUtilizationInfo(nvmlDevice_t device, nvmlVgpuProcessesUtilizationInfo_t *vgpuProcUtilInfo); ++ ++/** ++ * Queries the state of per process accounting mode on vGPU. ++ * ++ * For Maxwell &tm; or newer fully supported devices. 
++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param mode Reference in which to return the current accounting mode ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the mode has been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); ++ ++/** ++ * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes ++ * returned can be in running or terminated state. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * To just query the maximum number of processes that can be queried, call this function with *count = 0 and ++ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. ++ * ++ * For more details see \ref nvmlVgpuInstanceGetAccountingStats. ++ * ++ * @note In case of PID collision some processes might not be accessible before the circular buffer is full. 
++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param count Reference in which to provide the \a pids array size, and ++ * to return the number of elements ready to be queried ++ * @param pids Reference in which to return list of process ids ++ * ++ * @return ++ * - \ref NVML_SUCCESS if pids were successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ * ++ * @see nvmlVgpuInstanceGetAccountingPids ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); ++ ++/** ++ * Queries process's accounting stats. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and ++ * can be queried during life time of the process or after its termination. ++ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and ++ * updated to actual running time after its termination. ++ * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old ++ * processes. ++ * ++ * See \ref nvmlAccountingStats_t for description of each returned metric. ++ * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. ++ * ++ * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. 
++ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be ++ * queried since they don't contribute to GPU utilization. ++ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported ++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * @param pid Process Id of the target process to query stats for ++ * @param stats Reference in which to return the process's accounting stats ++ * ++ * @return ++ * - \ref NVML_SUCCESS if stats have been successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * or \a stats is not found ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); ++ ++/** ++ * Clears accounting information of processes on the vGPU instance that have already terminated. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * Requires root/admin permissions. ++ * ++ * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. ++ * @note Only compute and graphics applications stats are reported and can be cleared since monitoring applications ++ * stats don't contribute to GPU utilization.
++ * ++ * @param vgpuInstance The identifier of the target vGPU instance ++ * ++ * @return ++ * - \ref NVML_SUCCESS if accounting information has been cleared ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid ++ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance); ++ ++/** ++ * Query the license information of the vGPU instance. ++ * ++ * For Maxwell &tm; or newer fully supported devices. ++ * ++ * @param vgpuInstance Identifier of the target vGPU instance ++ * @param licenseInfo Pointer to vGPU license information structure ++ * ++ * @return ++ * - \ref NVML_SUCCESS if information is successfully retrieved ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licenseInfo is NULL ++ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system ++ * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlExcludedGpuQueries Excluded GPU Queries ++ * This chapter describes NVML operations that are associated with excluded GPUs. 
++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Excluded GPU device information ++ **/ ++typedef struct nvmlExcludedDeviceInfo_st ++{ ++ nvmlPciInfo_t pciInfo; //!< The PCI information for the excluded GPU ++ char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the excluded GPU ++} nvmlExcludedDeviceInfo_t; ++ ++ /** ++ * Retrieves the number of excluded GPU devices in the system. ++ * ++ * For all products. ++ * ++ * @param deviceCount Reference in which to return the number of excluded devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a deviceCount has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL ++ */ ++nvmlReturn_t DECLDIR nvmlGetExcludedDeviceCount(unsigned int *deviceCount); ++ ++/** ++ * Acquire the device information for an excluded GPU device, based on its index. ++ * ++ * For all products. ++ * ++ * Valid indices are derived from the \a deviceCount returned by ++ * \ref nvmlGetExcludedDeviceCount(). For example, if \a deviceCount is 2 the valid indices ++ * are 0 and 1, corresponding to GPU 0 and GPU 1. ++ * ++ * @param index The index of the target GPU, >= 0 and < \a deviceCount ++ * @param info Reference in which to return the device information ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a info has been set ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL ++ * ++ * @see nvmlGetExcludedDeviceCount ++ */ ++nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDeviceInfo_t *info); ++ ++/** @} */ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management ++ * This chapter describes NVML operations that are associated with Multi Instance GPU management.
++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Disable Multi Instance GPU mode. ++ */ ++#define NVML_DEVICE_MIG_DISABLE 0x0 ++ ++/** ++ * Enable Multi Instance GPU mode. ++ */ ++#define NVML_DEVICE_MIG_ENABLE 0x1 ++ ++/** ++ * GPU instance profiles. ++ * ++ * These macros should be passed to \ref nvmlDeviceGetGpuInstanceProfileInfo to retrieve the ++ * detailed information about a GPU instance such as profile ID, engine counts. ++ */ ++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE 0x0 ++#define NVML_GPU_INSTANCE_PROFILE_2_SLICE 0x1 ++#define NVML_GPU_INSTANCE_PROFILE_3_SLICE 0x2 ++#define NVML_GPU_INSTANCE_PROFILE_4_SLICE 0x3 ++#define NVML_GPU_INSTANCE_PROFILE_7_SLICE 0x4 ++#define NVML_GPU_INSTANCE_PROFILE_8_SLICE 0x5 ++#define NVML_GPU_INSTANCE_PROFILE_6_SLICE 0x6 ++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 0x7 ++#define NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 0x8 ++#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9 ++#define NVML_GPU_INSTANCE_PROFILE_COUNT 0xA ++ ++/** ++ * MIG GPU instance profile capability. ++ * ++ * Bit field values representing MIG profile capabilities ++ * \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities ++ */ ++#define NVML_GPU_INTSTANCE_PROFILE_CAPS_P2P 0x1 ++ ++/** ++ * MIG compute instance profile capability. ++ * ++ * Bit field values representing MIG profile capabilities ++ * \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities ++ */ ++/* No capabilities for compute profiles currently exposed */ ++ ++typedef struct nvmlGpuInstancePlacement_st ++{ ++ unsigned int start; //!< Index of first occupied memory slice ++ unsigned int size; //!< Number of memory slices occupied ++} nvmlGpuInstancePlacement_t; ++ ++/** ++ * GPU instance profile information. 
++ */ ++typedef struct nvmlGpuInstanceProfileInfo_st ++{ ++ unsigned int id; //!< Unique profile ID within the device ++ unsigned int isP2pSupported; //!< Peer-to-Peer support ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< GPU instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int copyEngineCount; //!< Copy Engine count ++ unsigned int decoderCount; //!< Decoder Engine count ++ unsigned int encoderCount; //!< Encoder Engine count ++ unsigned int jpegCount; //!< JPEG Engine count ++ unsigned int ofaCount; //!< OFA Engine count ++ unsigned long long memorySizeMB; //!< Memory size in MBytes ++} nvmlGpuInstanceProfileInfo_t; ++ ++/** ++ * GPU instance profile information (v2). ++ * ++ * Version 2 adds the \ref nvmlGpuInstanceProfileInfo_v2_t.version field ++ * to the start of the structure, and the \ref nvmlGpuInstanceProfileInfo_v2_t.name ++ * field to the end. This structure is not backwards-compatible with ++ * \ref nvmlGpuInstanceProfileInfo_t. 
++ */ ++typedef struct nvmlGpuInstanceProfileInfo_v2_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) ++ unsigned int id; //!< Unique profile ID within the device ++ unsigned int isP2pSupported; //!< Peer-to-Peer support ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< GPU instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int copyEngineCount; //!< Copy Engine count ++ unsigned int decoderCount; //!< Decoder Engine count ++ unsigned int encoderCount; //!< Encoder Engine count ++ unsigned int jpegCount; //!< JPEG Engine count ++ unsigned int ofaCount; //!< OFA Engine count ++ unsigned long long memorySizeMB; //!< Memory size in MBytes ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++} nvmlGpuInstanceProfileInfo_v2_t; ++ ++/** ++ * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. ++ */ ++#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) ++ ++/** ++ * GPU instance profile information (v3). ++ * ++ * Version 3 removes the isP2pSupported field and adds the \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities ++ * field relative to \ref nvmlGpuInstanceProfileInfo_v2_t.
++ */ ++typedef struct nvmlGpuInstanceProfileInfo_v3_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v3) ++ unsigned int id; //!< Unique profile ID within the device ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< GPU instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int copyEngineCount; //!< Copy Engine count ++ unsigned int decoderCount; //!< Decoder Engine count ++ unsigned int encoderCount; //!< Encoder Engine count ++ unsigned int jpegCount; //!< JPEG Engine count ++ unsigned int ofaCount; //!< OFA Engine count ++ unsigned long long memorySizeMB; //!< Memory size in MBytes ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++ unsigned int capabilities; //!< Additional capabilities ++} nvmlGpuInstanceProfileInfo_v3_t; ++ ++/** ++ * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v3_t.version. ++ */ ++#define nvmlGpuInstanceProfileInfo_v3 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 3) ++ ++typedef struct nvmlGpuInstanceInfo_st ++{ ++ nvmlDevice_t device; //!< Parent device ++ unsigned int id; //!< Unique instance ID within the device ++ unsigned int profileId; //!< Unique profile ID within the device ++ nvmlGpuInstancePlacement_t placement; //!< Placement for this instance ++} nvmlGpuInstanceInfo_t; ++ ++typedef struct nvmlGpuInstance_st* nvmlGpuInstance_t; ++ ++/** ++ * Compute instance profiles. 
++ * ++ * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the ++ * detailed information about a compute instance such as profile ID, engine counts ++ */ ++#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 ++#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 ++#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 ++#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 ++#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 ++#define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE 0x5 ++#define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE 0x6 ++#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 0x7 ++#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x8 ++ ++#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared ++#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT 0x1 ++ ++typedef struct nvmlComputeInstancePlacement_st ++{ ++ unsigned int start; //!< Index of first occupied compute slice ++ unsigned int size; //!< Number of compute slices occupied ++} nvmlComputeInstancePlacement_t; ++ ++/** ++ * Compute instance profile information. ++ */ ++typedef struct nvmlComputeInstanceProfileInfo_st ++{ ++ unsigned int id; //!< Unique profile ID within the GPU instance ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< Compute instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++} nvmlComputeInstanceProfileInfo_t; ++ ++/** ++ * Compute instance profile information (v2). 
++ * ++ * Version 2 adds the \ref nvmlComputeInstanceProfileInfo_v2_t.version field ++ * to the start of the structure, and the \ref nvmlComputeInstanceProfileInfo_v2_t.name ++ * field to the end. This structure is not backwards-compatible with ++ * \ref nvmlComputeInstanceProfileInfo_t. ++ */ ++typedef struct nvmlComputeInstanceProfileInfo_v2_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v2) ++ unsigned int id; //!< Unique profile ID within the GPU instance ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< Compute instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++} nvmlComputeInstanceProfileInfo_v2_t; ++ ++/** ++ * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v2_t.version. ++ */ ++#define nvmlComputeInstanceProfileInfo_v2 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 2) ++ ++/** ++ * Compute instance profile information (v3). ++ * ++ * Version 3 adds the \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities field ++ * to the end of \ref nvmlComputeInstanceProfileInfo_v2_t.
++ */ ++typedef struct nvmlComputeInstanceProfileInfo_v3_st ++{ ++ unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v3) ++ unsigned int id; //!< Unique profile ID within the GPU instance ++ unsigned int sliceCount; //!< GPU Slice count ++ unsigned int instanceCount; //!< Compute instance count ++ unsigned int multiprocessorCount; //!< Streaming Multiprocessor count ++ unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count ++ unsigned int sharedDecoderCount; //!< Shared Decoder Engine count ++ unsigned int sharedEncoderCount; //!< Shared Encoder Engine count ++ unsigned int sharedJpegCount; //!< Shared JPEG Engine count ++ unsigned int sharedOfaCount; //!< Shared OFA Engine count ++ char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name ++ unsigned int capabilities; //!< Additional capabilities ++} nvmlComputeInstanceProfileInfo_v3_t; ++ ++/** ++ * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v3_t.version. ++ */ ++#define nvmlComputeInstanceProfileInfo_v3 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 3) ++ ++typedef struct nvmlComputeInstanceInfo_st ++{ ++ nvmlDevice_t device; //!< Parent device ++ nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance ++ unsigned int id; //!< Unique instance ID within the GPU instance ++ unsigned int profileId; //!< Unique profile ID within the GPU instance ++ nvmlComputeInstancePlacement_t placement; //!< Placement for this instance within the GPU instance's compute slice range {0, sliceCount} ++} nvmlComputeInstanceInfo_t; ++ ++typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; ++ ++/** ++ * Set MIG mode for the device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Requires root user. ++ * ++ * This mode determines whether a GPU instance can be created. ++ * ++ * This API may unbind or reset the device to activate the requested mode. 
Thus, the attributes associated with the ++ * device, such as minor number, might change. The caller of this API is expected to query such attributes again. ++ * ++ * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM ++ * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. ++ * ++ * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device ++ * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API ++ * is expected to idle the device and retry setting the \a mode. ++ * ++ * @note On Windows, only disabling MIG mode is supported. \a activationStatus would return \ref ++ * NVML_ERROR_NOT_SUPPORTED as GPU reset is not supported on Windows through this API. ++ * ++ * @param device The identifier of the target device ++ * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or ++ * \ref NVML_DEVICE_MIG_ENABLE ++ * @param activationStatus Returns the status of the mode activation attempt ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a mode or \a activationStatus are invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); ++ ++/** ++ * Get MIG mode for the device. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * ++ * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the ++ * next activation trigger.
++ * ++ * @param device The identifier of the target device ++ * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or ++ * \ref NVML_DEVICE_MIG_ENABLE ++ * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or ++ * \ref NVML_DEVICE_MIG_ENABLE ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); ++ ++/** ++ * Get GPU instance profile information ++ * ++ * Information provided by this API is immutable throughout the lifetime of a MIG mode. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, ++ nvmlGpuInstanceProfileInfo_t *info); ++ ++/** ++ * Versioned wrapper around \ref nvmlDeviceGetGpuInstanceProfileInfo that accepts a versioned ++ * \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure. 
++ * ++ * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the ++ * appropriate version prior to calling this function. For example: ++ * \code ++ * nvmlGpuInstanceProfileInfo_v2_t profileInfo = ++ * { .version = nvmlGpuInstanceProfileInfo_v2 }; ++ * nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device, ++ * profile, ++ * &profileInfo); ++ * \endcode ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device The identifier of the target device ++ * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a info, or \a info->version are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, ++ nvmlGpuInstanceProfileInfo_v2_t *info); ++ ++/** ++ * Get GPU instance placements. ++ * ++ * A placement represents the location of a GPU instance within a device. This API only returns all the possible ++ * placements for the given profile regardless of whether MIG is enabled or not. ++ * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will ++ * fail if there is overlap with the already occupied memory slices. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param placements Returns placements allowed for the profile. Can be NULL to discover number ++ * of allowed placements for this profile. If non-NULL must be large enough ++ * to accommodate the placements supported by the profile. ++ * @param count Returns number of allowed placements for the profile. ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, ++ nvmlGpuInstancePlacement_t *placements, ++ unsigned int *count); ++ ++/** ++ * Get GPU instance profile capacity. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param count Returns remaining instance count for the profile ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, ++ unsigned int *count); ++ ++/** ++ * Create GPU instance.
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would ++ * become invalid. The GPU instance must be recreated to acquire a valid handle. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param gpuInstance Returns the GPU instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId or \a gpuInstance are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, ++ nvmlGpuInstance_t *gpuInstance); ++ ++/** ++ * Create GPU instance with the specified placement. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would ++ * become invalid. The GPU instance must be recreated to acquire a valid handle. ++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param placement The requested placement. 
See \ref nvmlDeviceGetGpuInstancePossiblePlacements_v2 ++ * @param gpuInstance Returns the GPU instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId, \a placement or \a gpuInstance ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstanceWithPlacement(nvmlDevice_t device, unsigned int profileId, ++ const nvmlGpuInstancePlacement_t *placement, ++ nvmlGpuInstance_t *gpuInstance); ++/** ++ * Destroy GPU instance. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The GPU instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes ++ * (e.g. CUDA application) or compute instances are active on the ++ * GPU instance. ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); ++ ++/** ++ * Get GPU instances for given profile ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. 
++ * ++ * @param device The identifier of the target device ++ * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param gpuInstances Returns pre-existing GPU instances, the buffer must be large enough to ++ * accommodate the instances supported by the profile. ++ * See \ref nvmlDeviceGetGpuInstanceProfileInfo ++ * @param count The count of returned GPU instances ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, ++ nvmlGpuInstance_t *gpuInstances, unsigned int *count); ++ ++/** ++ * Get GPU instances for given instance ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param device The identifier of the target device ++ * @param id The GPU instance ID ++ * @param gpuInstance Returns GPU instance ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found. ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance); ++ ++/** ++ * Get GPU instance information.
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param gpuInstance The GPU instance handle ++ * @param info Return GPU instance information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); ++ ++/** ++ * Get compute instance profile information. ++ * ++ * Information provided by this API is immutable throughout the lifetime of a MIG mode. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* ++ * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, ++ unsigned int engProfile, ++ nvmlComputeInstanceProfileInfo_t *info); ++ ++/** ++ * Versioned wrapper around \ref nvmlGpuInstanceGetComputeInstanceProfileInfo that accepts a versioned ++ * \ref nvmlComputeInstanceProfileInfo_v2_t or later output structure. 
++ * ++ * @note The caller must set the \ref nvmlComputeInstanceProfileInfo_v2_t.version field to the ++ * appropriate version prior to calling this function. For example: ++ * \code ++ * nvmlComputeInstanceProfileInfo_v2_t profileInfo = ++ * { .version = nvmlComputeInstanceProfileInfo_v2 }; ++ * nvmlReturn_t result = nvmlGpuInstanceGetComputeInstanceProfileInfoV(gpuInstance, ++ * profile, ++ * engProfile, ++ * &profileInfo); ++ * \endcode ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* ++ * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* ++ * @param info Returns detailed profile information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile, \a info, or \a info->version are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, ++ unsigned int engProfile, ++ nvmlComputeInstanceProfileInfo_v2_t *info); ++ ++/** ++ * Get compute instance profile capacity. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID.
++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param count Returns remaining instance count for the profile ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, ++ unsigned int profileId, unsigned int *count); ++ ++/** ++ * Get compute instance placements. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * A placement represents the location of a compute instance within a GPU instance. This API only returns all the possible ++ * placements for the given profile. ++ * A created compute instance occupies compute slices described by its placement. Creation of new compute instance will ++ * fail if there is overlap with the already occupied compute slices. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param placements Returns placements allowed for the profile. Can be NULL to discover number ++ * of allowed placements for this profile. If non-NULL must be large enough ++ * to accommodate the placements supported by the profile. ++ * @param count Returns number of allowed placements for the profile.
++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a count are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance_t gpuInstance, ++ unsigned int profileId, ++ nvmlComputeInstancePlacement_t *placements, ++ unsigned int *count); ++ ++/** ++ * Create compute instance. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed ++ * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire ++ * a valid handle. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. 
++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param computeInstance Returns the compute instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, ++ nvmlComputeInstance_t *computeInstance); ++ ++/** ++ * Create compute instance with the specified placement. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed ++ * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire ++ * a valid handle. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. ++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param placement The requested placement. 
See \ref nvmlGpuInstanceGetComputeInstancePossiblePlacements ++ * @param computeInstance Returns the compute instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstanceWithPlacement(nvmlGpuInstance_t gpuInstance, unsigned int profileId, ++ const nvmlComputeInstancePlacement_t *placement, ++ nvmlComputeInstance_t *computeInstance); ++ ++/** ++ * Destroy compute instance. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param computeInstance The compute instance handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if ++ * processes (e.g. CUDA application) are active on the compute instance. ++ */ ++nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); ++ ++/** ++ * Get compute instances for given profile ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param profileId The compute instance profile ID. 
++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param computeInstances Returns pre-existing compute instances, the buffer must be large enough to ++ * accommodate the instances supported by the profile. ++ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo ++ * @param count The count of returned compute instances ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count ++ * are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, ++ nvmlComputeInstance_t *computeInstances, unsigned int *count); ++ ++/** ++ * Get compute instance for given instance ID. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * Requires privileged user. ++ * ++ * @param gpuInstance The identifier of the target GPU instance ++ * @param id The compute instance ID ++ * @param computeInstance Returns compute instance ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a id or \a computeInstance are invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. ++ */ ++nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, ++ nvmlComputeInstance_t *computeInstance); ++ ++/** ++ * Get compute instance information.
++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param computeInstance The compute instance handle ++ * @param info Return compute instance information ++ * ++ * @return ++ * - \ref NVML_SUCCESS Upon success ++ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid ++ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation ++ */ ++nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); ++ ++/** ++ * Test if the given handle refers to a MIG device. ++ * ++ * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. ++ * These overloaded references can be used (with some restrictions) interchangeably ++ * with a GPU device handle to execute queries at a per-compute instance granularity. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device NVML handle to test ++ * @param isMigDevice True when handle refers to a MIG device ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device status was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); ++ ++/** ++ * Get GPU instance ID for the given MIG device handle. ++ * ++ * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param device Target MIG device handle ++ * @param id GPU instance ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS if instance ID was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); ++ ++/** ++ * Get compute instance ID for the given MIG device handle. ++ * ++ * Compute instance IDs are unique per GPU instance and remain valid until the compute instance ++ * is destroyed. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device Target MIG device handle ++ * @param id Compute instance ID ++ * ++ * @return ++ * - \ref NVML_SUCCESS if instance ID was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); ++ ++/** ++ * Get the maximum number of MIG devices that can exist under a given parent NVML device. ++ * ++ * Returns zero if MIG is not supported or enabled. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param device Target device handle ++ * @param count Count of MIG devices ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a count was successfully retrieved ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); ++ ++/** ++ * Get MIG device handle for the given index under its parent NVML device. ++ * ++ * If the compute instance is destroyed either explicitly or by destroying, ++ * resetting or unbinding the parent GPU instance or the GPU device itself ++ * the MIG device handle would remain invalid and must be requested again ++ * using this API. Handles may be reused and their properties can change in ++ * the process. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. ++ * ++ * @param device Reference to the parent GPU device handle ++ * @param index Index of the MIG device ++ * @param migDevice Reference to the MIG device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a migDevice handle was successfully created ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, ++ nvmlDevice_t *migDevice); ++ ++/** ++ * Get parent device handle from a MIG device handle. ++ * ++ * For Ampere &tm; or newer fully supported devices. ++ * Supported on Linux only. 
++ * ++ * @param migDevice MIG device handle ++ * @param device Device handle ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a device handle was successfully created ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); ++ ++/** @} */ // @defgroup nvmlMultiInstanceGPU ++ ++ ++/***************************************************************************************************/ ++/** @defgroup GPM NVML GPM ++ * @{ ++ */ ++/***************************************************************************************************/ ++/** @defgroup nvmlGpmEnums GPM Enums ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * GPM Metric Identifiers ++ */ ++typedef enum ++{ ++ NVML_GPM_METRIC_GRAPHICS_UTIL = 1, //!< Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 ++ NVML_GPM_METRIC_SM_UTIL = 2, //!< Percentage of SMs that were busy. 0.0 - 100.0 ++ NVML_GPM_METRIC_SM_OCCUPANCY = 3, //!< Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 ++ NVML_GPM_METRIC_INTEGER_UTIL = 4, //!< Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, //!< Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, //!< Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, //!< Percentage of time the GPU's SMs were doing HMMA tensor operations. 
0.0 - 100.0 ++ NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, //!< Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 ++ NVML_GPM_METRIC_DRAM_BW_UTIL = 10, //!< Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ ++ NVML_GPM_METRIC_FP64_UTIL = 11, //!< Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 ++ NVML_GPM_METRIC_FP32_UTIL = 12, //!< Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 ++ NVML_GPM_METRIC_FP16_UTIL = 13, //!< Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 ++ NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, //!< PCIe traffic from this GPU in MiB/sec ++ NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, //!< PCIe traffic to this GPU in MiB/sec ++ NVML_GPM_METRIC_NVDEC_0_UTIL = 30, //!< Percent utilization of NVDEC 0. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_1_UTIL = 31, //!< Percent utilization of NVDEC 1. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_2_UTIL = 32, //!< Percent utilization of NVDEC 2. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_3_UTIL = 33, //!< Percent utilization of NVDEC 3. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_4_UTIL = 34, //!< Percent utilization of NVDEC 4. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_5_UTIL = 35, //!< Percent utilization of NVDEC 5. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_6_UTIL = 36, //!< Percent utilization of NVDEC 6. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVDEC_7_UTIL = 37, //!< Percent utilization of NVDEC 7. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_0_UTIL = 40, //!< Percent utilization of NVJPG 0. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_1_UTIL = 41, //!< Percent utilization of NVJPG 1. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_2_UTIL = 42, //!< Percent utilization of NVJPG 2. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_3_UTIL = 43, //!< Percent utilization of NVJPG 3. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_4_UTIL = 44, //!< Percent utilization of NVJPG 4. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_5_UTIL = 45, //!< Percent utilization of NVJPG 5. 
0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_6_UTIL = 46, //!< Percent utilization of NVJPG 6. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVJPG_7_UTIL = 47, //!< Percent utilization of NVJPG 7. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVOFA_0_UTIL = 50, //!< Percent utilization of NVOFA 0. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVOFA_1_UTIL = 51, //!< Percent utilization of NVOFA 1. 0.0 - 100.0 ++ NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, //!< NvLink read bandwidth for all links in MiB/sec ++ NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, //!< NvLink write bandwidth for all links in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, //!< NvLink read bandwidth for link 0 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, //!< NvLink write bandwidth for link 0 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, //!< NvLink read bandwidth for link 1 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, //!< NvLink write bandwidth for link 1 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, //!< NvLink read bandwidth for link 2 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, //!< NvLink write bandwidth for link 2 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, //!< NvLink read bandwidth for link 3 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, //!< NvLink write bandwidth for link 3 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, //!< NvLink read bandwidth for link 4 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, //!< NvLink write bandwidth for link 4 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, //!< NvLink read bandwidth for link 5 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, //!< NvLink write bandwidth for link 5 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, //!< NvLink read bandwidth for link 6 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, //!< NvLink write bandwidth for link 6 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, //!< NvLink read bandwidth for link 7 in MiB/sec ++ 
NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, //!< NvLink write bandwidth for link 7 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, //!< NvLink read bandwidth for link 8 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, //!< NvLink write bandwidth for link 8 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, //!< NvLink read bandwidth for link 9 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, //!< NvLink write bandwidth for link 9 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, //!< NvLink read bandwidth for link 10 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, //!< NvLink write bandwidth for link 10 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, //!< NvLink read bandwidth for link 11 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, //!< NvLink write bandwidth for link 11 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, //!< NvLink read bandwidth for link 12 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, //!< NvLink write bandwidth for link 12 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, //!< NvLink read bandwidth for link 13 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, //!< NvLink write bandwidth for link 13 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, //!< NvLink read bandwidth for link 14 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, //!< NvLink write bandwidth for link 14 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, //!< NvLink read bandwidth for link 15 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, //!< NvLink write bandwidth for link 15 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, //!< NvLink read bandwidth for link 16 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, //!< NvLink write bandwidth for link 16 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, //!< NvLink read bandwidth for link 17 in MiB/sec ++ NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, //!< NvLink 
write bandwidth for link 17 in MiB/sec ++ //Put new metrics for BLACKWELL here... ++ NVML_GPM_METRIC_MAX = 98, //!< Maximum value above +1. Note that changing this should also change NVML_GPM_METRICS_GET_VERSION due to struct size change ++} nvmlGpmMetricId_t; ++ ++/** @} */ // @defgroup nvmlGpmEnums ++ ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlGpmStructs GPM Structs ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc(). Free this with nvmlGpmSampleFree(). ++ */ ++typedef struct nvmlGpmSample_st* nvmlGpmSample_t; ++ ++/** ++ * GPM metric information. ++ */ ++typedef struct ++{ ++ unsigned int metricId; //!< IN: NVML_GPM_METRIC_? define of which metric to retrieve ++ nvmlReturn_t nvmlReturn; //!< OUT: Status of this metric. If this is nonzero, then value is not valid ++ double value; //!< OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) ++ struct ++ { ++ char *shortName; ++ char *longName; ++ char *unit; ++ } metricInfo; //!< OUT: Metric name and unit. Those can be NULL if not defined ++} nvmlGpmMetric_t; ++ ++/** ++ * GPM buffer information. ++ */ ++typedef struct ++{ ++ unsigned int version; //!< IN: Set to NVML_GPM_METRICS_GET_VERSION ++ unsigned int numMetrics; //!< IN: How many metrics to retrieve in metrics[] ++ nvmlGpmSample_t sample1; //!< IN: Sample buffer ++ nvmlGpmSample_t sample2; //!< IN: Sample buffer ++ nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; //!< IN/OUT: Array of metrics. Set metricId on call. See nvmlReturn and value on return ++} nvmlGpmMetricsGet_t; ++ ++#define NVML_GPM_METRICS_GET_VERSION 1 ++ ++/** ++ * GPM device information. 
++ */ ++typedef struct ++{ ++ unsigned int version; //!< IN: Set to NVML_GPM_SUPPORT_VERSION ++ unsigned int isSupportedDevice; //!< OUT: Indicates device support ++} nvmlGpmSupport_t; ++ ++#define NVML_GPM_SUPPORT_VERSION 1 ++ ++/** @} */ // @defgroup nvmlGPMStructs ++ ++/***************************************************************************************************/ ++/** @defgroup nvmlGpmFunctions GPM Functions ++ * @{ ++ */ ++/***************************************************************************************************/ ++ ++/** ++ * Calculate GPM metrics from two samples. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum on error ++ */ ++nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); ++ ++ ++/** ++ * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param gpmSample Sample to free ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample); ++ ++ ++/** ++ * Allocate a sample buffer to be used with NVML GPM . You will need to allocate ++ * at least two of these buffers to use with the NVML GPM feature ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param gpmSample Where the allocated sample will be stored ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided ++ * - \ref NVML_ERROR_MEMORY if system memory is insufficient ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample); ++ ++/** ++ * Read a sample of GPM metrics into the provided \a gpmSample buffer. 
After ++ * two samples are gathered, you can call nvmlGpmMetricsGet on those samples to ++ * retrieve metrics ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param device Device to get samples for ++ * @param gpmSample Buffer to read samples into ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum on error ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample); ++ ++/** ++ * Read a sample of GPM metrics into the provided \a gpmSample buffer for a MIG GPU Instance. ++ * ++ * After two samples are gathered, you can call nvmlGpmMetricsGet on those ++ * samples to retrieve metrics ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * ++ * @param device Device to get samples for ++ * @param gpuInstanceId MIG GPU Instance ID ++ * @param gpmSample Buffer to read samples into ++ * ++ * @return ++ * - \ref NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum on error ++ */ ++nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample); ++ ++/** ++ * Indicate whether the supplied device supports GPM ++ * ++ * @param device NVML device to query for ++ * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates ++ * GPM support per system for the supplied device ++ * ++ * @return ++ * - NVML_SUCCESS on success ++ * - Nonzero NVML_ERROR_? enum if there is an error in processing the query ++ */ ++nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); ++ ++/* GPM Stream State */ ++/** ++ * Get GPM stream state. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC.
++ * ++ * @param device The identifier of the target device ++ * @param state Returns GPM stream state ++ * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current GPM stream state were successfully queried ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a state is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlGpmQueryIfStreamingEnabled(nvmlDevice_t device, unsigned int *state); ++ ++/** ++ * Set GPM stream state. ++ * ++ * For Hopper &tm; or newer fully supported devices. ++ * Supported on Linux, Windows TCC. ++ * ++ * @param device The identifier of the target device ++ * @param state GPM stream state, ++ * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED ++ * ++ * @return ++ * - \ref NVML_SUCCESS if \a current GPM stream state is successfully set ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid ++ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device ++ */ ++nvmlReturn_t DECLDIR nvmlGpmSetStreamingEnabled(nvmlDevice_t device, unsigned int state); ++ ++/** @} */ // @defgroup nvmlGpmFunctions ++/** @} */ // @defgroup GPM ++ ++#define NVML_DEV_CAP_EGM (1 << 0) // Extended GPU memory ++/** ++ * Device capabilities ++ */ ++typedef struct ++{ ++ unsigned int version; //!< the API version number ++ unsigned int capMask; //!< OUT: Bit mask of capabilities. ++} nvmlDeviceCapabilities_v1_t; ++typedef nvmlDeviceCapabilities_v1_t nvmlDeviceCapabilities_t; ++#define nvmlDeviceCapabilities_v1 NVML_STRUCT_VERSION(DeviceCapabilities, 1) ++ ++/** ++ * Get device capabilities ++ * ++ * See \ref nvmlDeviceCapabilities_v1_t for more information on the struct. 
++ * ++ * @param device The identifier of the target device ++ * @param caps Returns GPU's capabilities ++ * ++ * @return ++ * - \ref NVML_SUCCESS if the query was successful ++ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized ++ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a caps is NULL ++ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature ++ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible ++ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the provided version is invalid/unsupported ++ * - \ref NVML_ERROR_UNKNOWN on any unexpected error ++ */ ++nvmlReturn_t DECLDIR nvmlDeviceGetCapabilities(nvmlDevice_t device, ++ nvmlDeviceCapabilities_t *caps); ++ ++/** ++ * NVML API versioning support ++ */ ++ ++#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS ++nvmlReturn_t DECLDIR nvmlInit(void); ++nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); ++nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); ++nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci); ++nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v2(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v3(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); ++nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu(nvmlPciInfo_t *pciInfo); ++nvmlReturn_t DECLDIR
nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); ++nvmlReturn_t DECLDIR nvmlDeviceGetAttributes(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); ++nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); ++nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t *placements, unsigned int *count); ++nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); ++nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); ++#endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS ++ ++#if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) ++// We don't define APIs to run new versions if this guard is present so there is ++// no need to undef ++#elif defined(__NVML_API_VERSION_INTERNAL) ++#undef nvmlDeviceGetGraphicsRunningProcesses ++#undef nvmlDeviceGetComputeRunningProcesses ++#undef nvmlDeviceGetMPSComputeRunningProcesses ++#undef nvmlDeviceGetAttributes 
++#undef nvmlComputeInstanceGetInfo ++#undef nvmlEventSetWait ++#undef nvmlDeviceGetGridLicensableFeatures ++#undef nvmlDeviceRemoveGpu ++#undef nvmlDeviceGetNvLinkRemotePciInfo ++#undef nvmlDeviceGetPciInfo ++#undef nvmlDeviceGetCount ++#undef nvmlDeviceGetHandleByIndex ++#undef nvmlDeviceGetHandleByPciBusId ++#undef nvmlInit ++#undef nvmlBlacklistDeviceInfo_t ++#undef nvmlGetBlacklistDeviceCount ++#undef nvmlGetBlacklistDeviceInfoByIndex ++#undef nvmlDeviceGetGpuInstancePossiblePlacements ++#undef nvmlVgpuInstanceGetLicenseInfo ++#undef nvmlDeviceGetDriverModel ++#undef nvmlDeviceSetPowerManagementLimit ++ ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/contrib/nvml.py b/contrib/nvml.py +index 9f2c57d..2516979 100644 +--- a/contrib/nvml.py ++++ b/contrib/nvml.py +@@ -1,6 +1,7 @@ + import re ++import os + +-PATH="/usr/local/cuda/include/nvml.h" ++PATH=["./contrib/nvml.h", "/usr/local/cuda/include/nvml.h"] + func = ["nvmlInit", + "nvmlDeviceGetSupportedEventTypes", + "nvmlDeviceRegisterEvents", +@@ -22,7 +23,13 @@ type_pattern = re.compile( + flags=re.MULTILINE + ) + +-with open(PATH, 'r') as file: ++path="" ++if os.path.exists(PATH[0]) and os.access(PATH[0], os.R_OK): ++ path = PATH[0] ++else: ++ path = PATH[1] ++ ++with open(path, 'r') as file: + content = file.read() + matched_lines = pattern.findall(content) + type_lines = type_pattern.findall(content) +@@ -55,7 +62,7 @@ print(''' + ) + print('#include \ + \n#include \ +- \n#include "/usr/local/cuda/include/nvml.h"') ++ \n#include "{}"'.format(path)) + print('\ntypedef const char* (*my_nvmlErrorString_p)(nvmlReturn_t result);') + print('\n'.join(func_declares)) + print('\nmy_nvmlErrorString_p my_nvmlErrorString;') +-- +2.43.5 + diff --git a/1023-ras-mc-ctl-add-option-to-exclude-old-events-from-rep.patch b/1023-ras-mc-ctl-add-option-to-exclude-old-events-from-rep.patch deleted file mode 100644 index 07cfe1b4ea1941714115496c3a534289c2f432c6..0000000000000000000000000000000000000000 --- 
a/1023-ras-mc-ctl-add-option-to-exclude-old-events-from-rep.patch +++ /dev/null @@ -1,273 +0,0 @@ -From b052d9bad784ba10bc1281f027808ef4cb0d00eb Mon Sep 17 00:00:00 2001 -From: Marcus Sundman -Date: Thu, 20 Apr 2023 18:17:17 +0300 -Subject: [PATCH 23/85] ras-mc-ctl: add option to exclude old events from - reports - -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 59 +++++++++++++++++++++++++++------------------- - 1 file changed, 35 insertions(+), 24 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index e765519..13078c2 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -95,6 +95,7 @@ Usage: $prog [OPTIONS...] - --summary Presents a summary of the logged errors. - --errors Shows the errors stored at the error database. - --error-count Shows the corrected and uncorrected error counts using sysfs. -+ --since=YYYY-MM-DD Only include events since the date YYYY-MM-DD. - --vendor-errors-summary Presents a summary of the vendor-specific logged errors. - --vendor-errors Shows the vendor-specific errors stored in the error database. - --vendor-errors Shows the vendor-specific errors for a specific module stored in the error database. 
-@@ -175,6 +176,7 @@ sub parse_cmdline - $conf{opt}{error_count} = 0; - $conf{opt}{vendor_errors_summary} = 0; - $conf{opt}{vendor_errors} = 0; -+ $conf{opt}{since} = ''; - $conf{opt}{vendor_platforms} = 0; - - my $rref = \$conf{opt}{report}; -@@ -196,6 +198,7 @@ sub parse_cmdline - "error-count" => \$conf{opt}{error_count}, - "vendor-errors-summary" => \$conf{opt}{vendor_errors_summary}, - "vendor-errors" => \$conf{opt}{vendor_errors}, -+ "since=s" => \$conf{opt}{since}, - "vendor-platforms" => \$conf{opt}{vendor_platforms}, - ); - -@@ -207,6 +210,14 @@ sub parse_cmdline - log_error ("Only use --delay with --register-labels\n"); - exit (1); - } -+ -+ if ($conf{opt}{since}) { -+ if ($conf{opt}{since} !~ /^20\d\d-[01]\d-[0-3]\d/) { -+ log_error ("--since requires a date like yyyy-mm-dd where yyyy is the year, mm the month, and dd the day\n"); -+ exit (1); -+ } -+ $conf{opt}{since} = " where timestamp>='$conf{opt}{since}'"; -+ } - } - - sub usage -@@ -1168,7 +1179,7 @@ sub summary - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - - # Memory controller mc_event errors -- $query = "select err_type, label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event group by err_type, label, mc, top_layer, middle_layer, lower_layer"; -+ $query = "select err_type, label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event$conf{opt}{since} group by err_type, label, mc, top_layer, middle_layer, lower_layer"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($err_type, $label, $mc, $top, $mid, $low, $count)); -@@ -1185,7 +1196,7 @@ sub summary - - # PCIe AER aer_event errors - if ($has_aer == 1) { -- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -+ $query = "select err_type, err_msg, count(*) from aer_event$conf{opt}{since} group by err_type, err_msg"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - 
$query_handle->bind_columns(\($err_type, $msg, $count)); -@@ -1203,7 +1214,7 @@ sub summary - - # ARM processor arm_event errors - if ($has_arm == 1) { -- $query = "select mpidr, count(*) from arm_event group by mpidr"; -+ $query = "select mpidr, count(*) from arm_event$conf{opt}{since} group by mpidr"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($mpidr, $count)); -@@ -1221,7 +1232,7 @@ sub summary - - # extlog errors - if ($has_extlog == 1) { -- $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -+ $query = "select etype, severity, count(*) from extlog_event$conf{opt}{since} group by etype, severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($etype, $severity, $count)); -@@ -1241,7 +1252,7 @@ sub summary - - # devlink errors - if ($has_devlink == 1) { -- $query = "select dev_name, count(*) from devlink_event group by dev_name"; -+ $query = "select dev_name, count(*) from devlink_event$conf{opt}{since} group by dev_name"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($dev_name, $count)); -@@ -1259,7 +1270,7 @@ sub summary - - # Disk errors - if ($has_disk_errors == 1) { -- $query = "select dev, count(*) from disk_errors group by dev"; -+ $query = "select dev, count(*) from disk_errors$conf{opt}{since} group by dev"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($dev, $count)); -@@ -1277,7 +1288,7 @@ sub summary - - # Memory failure errors - if ($has_mem_failure == 1) { -- $query = "select action_result, count(*) from memory_failure_event group by action_result"; -+ $query = "select action_result, count(*) from memory_failure_event$conf{opt}{since} group by action_result"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($action_result, $count)); -@@ 
-1295,7 +1306,7 @@ sub summary - - # MCE mce_record errors - if ($has_mce == 1) { -- $query = "select error_msg, count(*) from mce_record group by error_msg"; -+ $query = "select error_msg, count(*) from mce_record$conf{opt}{since} group by error_msg"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($msg, $count)); -@@ -1328,7 +1339,7 @@ sub errors - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - - # Memory controller mc_event errors -- $query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event order by id"; -+ $query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - if (!$query_handle) { - log_error ("mc_event table missing from $dbname. 
Run 'rasdaemon --record'.\n"); -@@ -1349,7 +1360,7 @@ sub errors - - # PCIe AER aer_event errors - if ($has_aer == 1) { -- $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id"; -+ $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $time, $devname, $type, $msg)); -@@ -1367,7 +1378,7 @@ sub errors - - # ARM processor arm_event errors - if ($has_arm == 1) { -- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -@@ -1391,7 +1402,7 @@ sub errors - - # Extlog errors - if ($has_extlog == 1) { -- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -@@ -1418,7 +1429,7 @@ sub errors - - # devlink errors - if ($has_devlink == 1) { -- $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id"; -+ $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, 
$reporter_name, $msg)); -@@ -1442,7 +1453,7 @@ sub errors - - # Disk errors - if ($has_disk_errors == 1) { -- $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id"; -+ $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd)); -@@ -1467,7 +1478,7 @@ sub errors - - # Memory failure errors - if ($has_mem_failure == 1) { -- $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id"; -+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result)); -@@ -1486,7 +1497,7 @@ sub errors - - # MCE mce_record errors - if ($has_mce == 1) { -- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -@@ -1555,7 +1566,7 @@ sub vendor_errors_summary - # HiSilicon KunPeng9xx errors - if ($platform_id 
eq HISILICON_KUNPENG_9XX) { - $found_platform = 1; -- $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id"; -+ $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2$conf{opt}{since} group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($err_severity, $module_id, $count)); -@@ -1573,7 +1584,7 @@ sub vendor_errors_summary - } - $query_handle->finish; - -- $query = "select err_severity, module_id, count(*) from hip08_oem_type2_event_v2 group by err_severity, module_id"; -+ $query = "select err_severity, module_id, count(*) from hip08_oem_type2_event_v2$conf{opt}{since} group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($err_severity, $module_id, $count)); -@@ -1591,7 +1602,7 @@ sub vendor_errors_summary - } - $query_handle->finish; - -- $query = "select err_severity, sub_module_id, count(*) from hip08_pcie_local_event_v2 group by err_severity, sub_module_id"; -+ $query = "select err_severity, sub_module_id, count(*) from hip08_pcie_local_event_v2$conf{opt}{since} group by err_severity, sub_module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($err_severity, $sub_module_id, $count)); -@@ -1609,7 +1620,7 @@ sub vendor_errors_summary - } - $query_handle->finish; - -- $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id"; -+ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2$conf{opt}{since} group by err_severity, module_id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($err_severity, $module_id, $count)); -@@ -1663,7 +1674,7 @@ sub vendor_errors - # HiSilicon KunPeng9xx errors - if ($platform_id eq HISILICON_KUNPENG_9XX) 
{ - $found_platform = 1; -- $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity"; -+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2$conf{opt}{since} order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); -@@ -1688,7 +1699,7 @@ sub vendor_errors - } - $query_handle->finish; - -- $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type2_event_v2 order by id, module_id, err_severity"; -+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type2_event_v2$conf{opt}{since} order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs)); -@@ -1713,7 +1724,7 @@ sub vendor_errors - } - $query_handle->finish; - -- $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, sub_module_id, core_id, port_id, err_severity, err_type, regs_dump from hip08_pcie_local_event_v2 order by id, sub_module_id, err_severity"; -+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, sub_module_id, core_id, port_id, err_severity, err_type, regs_dump from hip08_pcie_local_event_v2$conf{opt}{since} order by id, sub_module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, 
$nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs)); -@@ -1740,7 +1751,7 @@ sub vendor_errors - } - $query_handle->finish; - -- $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity"; -+ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2$conf{opt}{since} order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); - $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs)); --- -2.33.1 - diff --git a/1024-anolis-do-not-print-teq-error.patch b/1024-anolis-do-not-print-teq-error.patch new file mode 100644 index 0000000000000000000000000000000000000000..4fbe782120b8c768ea8e0a55945f60751fd50adc --- /dev/null +++ b/1024-anolis-do-not-print-teq-error.patch @@ -0,0 +1,50 @@ +From c6a9ca106c41e1f351849bce5d491bba3813cc10 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Thu, 17 Apr 2025 17:26:48 +0800 +Subject: [PATCH 24/30] anolis: do not print teq error + +Signed-off-by: Ruidong Tian +--- + ras-cxl-handler.c | 2 +- + ras-mce-handler.c | 6 +++--- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 575fff8..55509f1 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -718,7 +718,7 @@ static int handle_ras_cxl_common_hdr(struct trace_seq *s, + if (trace_seq_printf(s, "hdr_maint_op_class:%u ", hdr->hdr_maint_op_class) <= 0) + return -1; + +- if (tep_get_field_val(s, event, "hdr_maint_op_sub_class", record, &val, 1) < 0) ++ if 
(tep_get_field_val(s, event, "hdr_maint_op_sub_class", record, &val, 0) < 0) + return -1; + hdr->hdr_maint_op_sub_class = val; + if (trace_seq_printf(s, "hdr_maint_op_sub_class:%u ", hdr->hdr_maint_op_sub_class) <= 0) +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index fc2e8d4..0f0d37f 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -571,15 +571,15 @@ int ras_mce_event_handler(struct trace_seq *s, + e.ipid = val; + + /* Get PPIN */ +- if (!tep_get_field_val(s, event, "ppin", record, &val, 1)) ++ if (!tep_get_field_val(s, event, "ppin", record, &val, 0)) + e.ppin = val; + + /* Get Microcode Revision */ +- if (!tep_get_field_val(s, event, "microcode", record, &val, 1)) ++ if (!tep_get_field_val(s, event, "microcode", record, &val, 0)) + e.microcode = val; + + /* Get Vendor-specfic Data, if any */ +- e.vdata = tep_get_field_raw(s, event, "v_data", record, &e.vdata_len, 1); ++ e.vdata = tep_get_field_raw(s, event, "v_data", record, &e.vdata_len, 0); + + switch (mce->cputype) { + case CPU_GENERIC: +-- +2.43.5 + diff --git a/1024-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch b/1024-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch deleted file mode 100644 index 6243b59afb0775735c02e7d668f1c83e36599fc7..0000000000000000000000000000000000000000 --- a/1024-rasdaemon-ras-mc-ctl-Add-support-to-display-the-THea.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 2a202d970dfc76e26b5d423fc10572fd0dd80164 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 7 Sep 2023 18:22:06 +0800 -Subject: [PATCH 24/85] rasdaemon: ras-mc-ctl: Add support to display the THead - vendor errors - -Add support for the THead YiTian DDRC register dump event. 
- -Signed-off-by: Ruidong Tian -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 48 ++++++++++++++++++++++++++++++++++++++++++++-- - 1 file changed, 46 insertions(+), 2 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 13078c2..5d8b46c 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1541,6 +1541,7 @@ sub errors - # Definitions of the vendor platform IDs. - use constant { - HISILICON_KUNPENG_9XX => "KunPeng9xx", -+ THEAD_YITIAN_7XX => "YiTian7XX", - }; - - sub vendor_errors_summary -@@ -1549,6 +1550,7 @@ sub vendor_errors_summary - my ($num_args, $platform_id, $found_platform); - my ($query, $query_handle, $count, $out); - my ($module_id, $sub_module_id, $err_severity, $err_sev); -+ my ($address); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1639,6 +1641,25 @@ sub vendor_errors_summary - $query_handle->finish; - } - -+ # THead Yitian710 DDR errors -+ if ($platform_id eq THEAD_YITIAN_7XX) { -+ $found_platform = 1; -+ $query = "select address, count(*) from yitian_ddr_reg_dump_event"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($address, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\terrors: $count"; -+ } -+ if ($out ne "") { -+ print "THead YiTian710 DDR error dump events summary:\n$out\n"; -+ } else { -+ print "No THead YiTian710 DDR error dump errors.\n\n"; -+ } -+ $query_handle->finish; -+ } -+ - if ($platform_id && !($found_platform)) { - print "Platform ID $platform_id is not valid\n"; - } -@@ -1653,6 +1674,7 @@ sub vendor_errors - my ($query, $query_handle, $id, $timestamp, $out); - my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id); - my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs); -+ my ($address, $regs_dump); - - $num_args = $#ARGV + 1; - $platform_id = 0; -@@ -1673,7 +1695,7 @@ sub vendor_errors - - # HiSilicon KunPeng9xx errors - if 
($platform_id eq HISILICON_KUNPENG_9XX) { -- $found_platform = 1; -+ $found_platform = 1; - $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2$conf{opt}{since} order by id, module_id, err_severity"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -@@ -1783,12 +1805,33 @@ sub vendor_errors - $query_handle->finish; - } - -+ # THead Yitian7xx ddr errors -+ if ($platform_id eq THEAD_YITIAN_7XX) { -+ $found_platform = 1; -+ $query = "select id, timestamp, address, regs_dump from yitian_ddr_reg_dump_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $address, $regs_dump)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id. $timestamp "; -+ $out .= "Error Address: $address "; -+ $out .= "Error Registers Dump: $regs_dump" if ($regs_dump); -+ $out .= "\n\n"; -+ } -+ if ($out ne "") { -+ print "THead Yitian710 DDRC error events:\n$out\n"; -+ } else { -+ print "No THead Yitian710 DDRC error events.\n"; -+ } -+ $query_handle->finish; -+ } -+ - if ($platform_id && !($found_platform)) { - print "Platform ID $platform_id is not valid\n"; - } elsif ($module && !($found_module)) { - print "No error record for the module $module\n"; - } -- - undef($dbh); - } - -@@ -1796,6 +1839,7 @@ sub vendor_platforms - { - print "\nSupported platforms for the vendor-specific errors:\n"; - print "\tHiSilicon KunPeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n"; -+ print "\tTHead Yitian7xx, platform-id=\"", THEAD_YITIAN_7XX, "\"\n"; - print "\n"; - } - --- -2.33.1 - diff --git a/1025-anolis-add-init.sh-for-different-user.patch b/1025-anolis-add-init.sh-for-different-user.patch new file mode 100644 index 0000000000000000000000000000000000000000..90dd4af91b6a349ef2135d5b1b12e3753b974ef5 --- /dev/null +++ b/1025-anolis-add-init.sh-for-different-user.patch @@ -0,0 +1,104 @@ 
+From bec7414b742dc7164d7674a0eb9489c4723514ab Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 18 Apr 2025 15:43:57 +0800 +Subject: [PATCH 25/30] anolis: add init.sh for different user + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 1 + + contrib/rasdaemon.init | 26 ++++++++++++++++++++++++++ + misc/rasdaemon.spec.in | 18 ++++++++++++------ + 3 files changed, 39 insertions(+), 6 deletions(-) + create mode 100644 contrib/rasdaemon.init + +diff --git a/Makefile.am b/Makefile.am +index 4aba962..203b576 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -24,6 +24,7 @@ EXTRA_DIST = \ + $(RSYSLOG_EXT_SERVICES_IN) \ + misc/rasdaemon.env \ + misc/notices \ ++ contrib/rasdaemon.init \ + contrib/nvml.py \ + contrib/nvml.h \ + contrib/*_trigger +diff --git a/contrib/rasdaemon.init b/contrib/rasdaemon.init +new file mode 100644 +index 0000000..d575af9 +--- /dev/null ++++ b/contrib/rasdaemon.init +@@ -0,0 +1,26 @@ ++#!/bin/sh ++target=$1 ++ENV_PATH="/etc/sysconfig/rasdaemon" ++ ++case "$target" in ++ ecs) ++ sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} ++ ;; ++ ebs) ++ sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} ++ sed -i 's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH} ++ sed -i 's/^TRIGGER_DIR=.*/TRIGGER_DIR="\/etc\/ras\/triggers"/g' ${ENV_PATH} ++ sed -i 's/^PRE_PAGE_OFFLINE_TRIGGER=.*/PRE_PAGE_OFFLINE_TRIGGER="page_offline_pre_trigger"/g' ${ENV_PATH} ++ sed -i 's/^POST_PAGE_OFFLINE_TRIGGER=.*/POST_PAGE_OFFLINE_TRIGGER="page_offline_post_trigger"/g' ${ENV_PATH} ++ ;; ++ jituan) ++ sed -i 's/json_report,kmsg_monitor,//' ${ENV_PATH} ++ sed -i 's/^AMDGPU_MCA_ENABLED=.*/AMDGPU_MCA_ENABLED=1/g' ${ENV_PATH} ++ exit 1 ++ ;; ++ zhuanyou) ++ sed -i 's/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION="soft"/g' ${ENV_PATH} ++ sed -i 's/^PAGE_CE_THRESHOLD=.*/PAGE_CE_THRESHOLD="10"/g' ${ENV_PATH} ++ ;; ++ ++esac +\ No newline at end of file +diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in +index 23be188..bf4cc4b 100644 
+--- a/misc/rasdaemon.spec.in ++++ b/misc/rasdaemon.spec.in +@@ -61,6 +61,7 @@ install -D -p -m 0655 misc/%{name}.rsyslog-ext %{buildroot}/usr/share/%{name}/%{ + install -D -p -m 0655 misc/%{name}.syslog-ng-ext %{buildroot}/usr/share/%{name}/%{name}.syslog-ng-ext + install -d %{buildroot}%{_sysconfdir}/rasdaemon_notices/ + install -D -p -m 0755 misc/notices/* %{buildroot}%{_sysconfdir}/rasdaemon_notices/ ++install -D -p -m 0755 contrib/%{name}.init %{buildroot}/usr/share/%{name}/%{name}.init + rm INSTALL %{buildroot}/usr/include/*.h + + %files +@@ -71,12 +72,13 @@ rm INSTALL %{buildroot}/usr/include/*.h + %{_unitdir}/*.service + %{_sysconfdir}/ras/dimm_labels.d + %{_sysconfdir}/ras/*/* +-%config(noreplace) %{_sysconfdir}/sysconfig/%{name} +-%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng +-%config(noreplace) /usr/share/%{name}/%{name}.logrotate +-%config(noreplace) /usr/share/%{name}/%{name}.rsyslog +-%config(noreplace) /usr/share/%{name}/%{name}.syslog-ng-ext +-%config(noreplace) /usr/share/%{name}/%{name}.rsyslog-ext ++%{_sysconfdir}/sysconfig/%{name} ++/usr/share/%{name}/%{name}.syslog-ng ++/usr/share/%{name}/%{name}.logrotate ++/usr/share/%{name}/%{name}.rsyslog ++/usr/share/%{name}/%{name}.syslog-ng-ext ++/usr/share/%{name}/%{name}.rsyslog-ext ++/usr/share/%{name}/%{name}.init + %{_sysconfdir}/rasdaemon_notices/* + + %post +@@ -104,6 +106,10 @@ if ! 
systemctl is-enabled --quiet %{name}.service; then + echo "Rasdaemon service is not enabled, enable it"; + systemctl enable %{name}.service; + fi ++echo "Rasdaemon install for ${RASDAEMON_TARGET}"; ++/usr/share/%{name}/%{name}.init ${RASDAEMON_TARGET} ++ ++systemctl daemon-reload + systemctl restart %{name}.service + + %preun +-- +2.43.5 + diff --git a/1025-rasdaemon-Add-Emerald-Rapids-support.patch b/1025-rasdaemon-Add-Emerald-Rapids-support.patch deleted file mode 100644 index 8c8534f4a4b902bf3358da825594afb92bc1f3e7..0000000000000000000000000000000000000000 --- a/1025-rasdaemon-Add-Emerald-Rapids-support.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 2f9f335ff3a7c70d87b435e43df775e3a73606a7 Mon Sep 17 00:00:00 2001 -From: "Delgado Vargas, Daniel" -Date: Fri, 20 Oct 2023 10:57:11 -0600 -Subject: [PATCH 25/85] rasdaemon: Add Emerald Rapids support - -Signed-off-by: Delgado Vargas, Daniel -Signed-off-by: Mauro Carvalho Chehab ---- - mce-intel-i10nm.c | 1 + - mce-intel.c | 1 + - ras-mce-handler.c | 3 +++ - ras-mce-handler.h | 1 + - 4 files changed, 6 insertions(+) - -diff --git a/mce-intel-i10nm.c b/mce-intel-i10nm.c -index 3c5d22f..c4ace56 100644 ---- a/mce-intel-i10nm.c -+++ b/mce-intel-i10nm.c -@@ -380,6 +380,7 @@ void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, - banktype = tremont[e->bank]; - break; - case CPU_SAPPHIRERAPIDS: -+ case CPU_EMERALDRAPIDS: - banktype = sapphire[e->bank]; - break; - default: -diff --git a/mce-intel.c b/mce-intel.c -index e083e9c..18a9072 100644 ---- a/mce-intel.c -+++ b/mce-intel.c -@@ -415,6 +415,7 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) - case CPU_ICELAKE_DE: - case CPU_TREMONT_D: - case CPU_SAPPHIRERAPIDS: -+ case CPU_EMERALDRAPIDS: - i10nm_decode_model(mce->cputype, ras, e); - default: - break; -diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index d09829d..370e68a 100644 ---- a/ras-mce-handler.c -+++ b/ras-mce-handler.c -@@ -61,6 +61,7 @@ static char *cputype_name[] = { - 
[CPU_ICELAKE_DE] = "Icelake server D Family", - [CPU_TREMONT_D] = "Tremont microserver", - [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server", -+ [CPU_EMERALDRAPIDS] = "Emeraldrapids server", - }; - - static enum cputype select_intel_cputype(struct mce_priv *mce) -@@ -118,6 +119,8 @@ static enum cputype select_intel_cputype(struct mce_priv *mce) - return CPU_TREMONT_D; - else if (mce->model == 0x8f) - return CPU_SAPPHIRERAPIDS; -+ else if (mce->model == 0xcf) -+ return CPU_EMERALDRAPIDS; - - if (mce->model > 0x1a) { - log(ALL, LOG_INFO, -diff --git a/ras-mce-handler.h b/ras-mce-handler.h -index b4babf3..68147f4 100644 ---- a/ras-mce-handler.h -+++ b/ras-mce-handler.h -@@ -53,6 +53,7 @@ enum cputype { - CPU_ICELAKE_DE, - CPU_TREMONT_D, - CPU_SAPPHIRERAPIDS, -+ CPU_EMERALDRAPIDS, - }; - - struct mce_event { --- -2.33.1 - diff --git a/1070-rasdaemon-fix-disk-error-log-storm.patch b/1026-anolis-fix-systemd-config.patch similarity index 46% rename from 1070-rasdaemon-fix-disk-error-log-storm.patch rename to 1026-anolis-fix-systemd-config.patch index f60c16ea631cf58f0872b1642fbe2f6108fb4cb0..45770982f50740631b332e895af7bd2785fe6587 100644 --- a/1070-rasdaemon-fix-disk-error-log-storm.patch +++ b/1026-anolis-fix-systemd-config.patch @@ -1,17 +1,18 @@ -From 0854ed0b1791ec1e01827720052310b79e9ff5bc Mon Sep 17 00:00:00 2001 +From 09d282c32c52224af0b7310b24e6ddf4cd4efb61 Mon Sep 17 00:00:00 2001 From: Ruidong Tian -Date: Thu, 12 Dec 2024 20:42:15 +0800 -Subject: [PATCH 70/85] rasdaemon: fix disk error log storm +Date: Fri, 18 Apr 2025 16:47:46 +0800 +Subject: [PATCH 26/30] anolis: fix systemd config +Signed-off-by: Ruidong Tian --- - misc/rasdaemon.service.in | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) + misc/rasdaemon.service.in | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in -index 4ef3d2c..508aa3c 100644 +index 0bb643f..c72b2d7 100644 --- a/misc/rasdaemon.service.in +++ 
b/misc/rasdaemon.service.in -@@ -4,7 +4,7 @@ After=syslog.target +@@ -7,10 +7,10 @@ Description=RAS daemon to log the RAS events [Service] EnvironmentFile=@SYSCONFDEFDIR@/rasdaemon @@ -19,7 +20,11 @@ index 4ef3d2c..508aa3c 100644 +ExecStart=@sbindir@/rasdaemon -f ExecStartPost=@sbindir@/rasdaemon --enable ExecStop=@sbindir@/rasdaemon --disable - Restart=on-abort +-Restart=on-abort ++Restart=always + + [Install] + WantedBy=multi-user.target -- -2.33.1 +2.43.5 diff --git a/1026-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch b/1026-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch deleted file mode 100644 index c1820f17931ceb9b43eadd23aa1baacce1eba660..0000000000000000000000000000000000000000 --- a/1026-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch +++ /dev/null @@ -1,122 +0,0 @@ -From 4a9931b5bbd13bdc8911fc6041251f53618fb6d3 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Thu, 24 Aug 2023 13:07:17 +0100 -Subject: [PATCH 26/85] rasdaemon: ras-mc-ctl: Modify check for HiSilicon - KunPeng9xx error fields - -Modify check for valid HiSilicon KunPeng9xx error fields. -Fixes an error data is not printed when it's value is 0. - -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 72 +++++++++++++++++++++++----------------------- - 1 file changed, 36 insertions(+), 36 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 5d8b46c..fb35afe 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1705,13 +1705,13 @@ sub vendor_errors - if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { - $out .= "$id. 
$timestamp Error Info: "; - $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs " if ($regs); -+ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); -+ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); -+ $out .= "module_id=$module_id, " if (defined $module_id && length $module_id); -+ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); -+ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); -+ $out .= "Error Registers: $regs " if (defined $regs && length $regs); - $out .= "\n\n"; - $found_module = 1; - } -@@ -1730,13 +1730,13 @@ sub vendor_errors - if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { - $out .= "$id. 
$timestamp Error Info: "; - $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs " if ($regs); -+ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); -+ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); -+ $out .= "module_id=$module_id, " if (defined $module_id && length $module_id); -+ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); -+ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); -+ $out .= "Error Registers: $regs " if (defined $regs && length $regs); - $out .= "\n\n"; - $found_module = 1; - } -@@ -1755,15 +1755,15 @@ sub vendor_errors - if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) { - $out .= "$id. 
$timestamp Error Info: "; - $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "core_id=$core_id, " if ($core_id); -- $out .= "port_id=$port_id, " if ($port_id); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "err_type=$err_type, " if ($err_type); -- $out .= "Error Registers: $regs " if ($regs); -+ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); -+ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); -+ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); -+ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); -+ $out .= "core_id=$core_id, " if (defined $core_id && length $core_id); -+ $out .= "port_id=$port_id, " if (defined $port_id && length $port_id); -+ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); -+ $out .= "err_type=$err_type, " if (defined $err_type && length $err_type); -+ $out .= "Error Registers: $regs " if (defined $regs && length $regs); - $out .= "\n\n"; - $found_module = 1; - } -@@ -1782,19 +1782,19 @@ sub vendor_errors - if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) { - $out .= "$id. 
$timestamp Error Info: "; - $out .= "version=$version, "; -- $out .= "soc_id=$soc_id, " if ($soc_id); -- $out .= "socket_id=$socket_id, " if ($socket_id); -- $out .= "totem_id=$totem_id, " if ($totem_id); -- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id); -- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id); -- $out .= "module_id=$module_id, " if ($module_id); -- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id); -- $out .= "core_id=$core_id, " if ($core_id); -- $out .= "port_id=$port_id, " if ($port_id); -- $out .= "err_type=$err_type, " if ($err_type); -- $out .= "pcie_info=$pcie_info, " if ($pcie_info); -- $out .= "err_severity=$err_severity, " if ($err_severity); -- $out .= "Error Registers: $regs" if ($regs); -+ $out .= "soc_id=$soc_id, " if (defined $soc_id && length $soc_id); -+ $out .= "socket_id=$socket_id, " if (defined $socket_id && length $socket_id); -+ $out .= "totem_id=$totem_id, " if (defined $totem_id && length $totem_id); -+ $out .= "nimbus_id=$nimbus_id, " if (defined $nimbus_id && length $nimbus_id); -+ $out .= "sub_system_id=$sub_system_id, " if (defined $sub_system_id && length $sub_system_id); -+ $out .= "module_id=$module_id, " if (defined $module_id && length $module_id); -+ $out .= "sub_module_id=$sub_module_id, " if (defined $sub_module_id && length $sub_module_id); -+ $out .= "core_id=$core_id, " if (defined $core_id && length $core_id ); -+ $out .= "port_id=$port_id, " if (defined $port_id && length $port_id); -+ $out .= "err_type=$err_type, " if (defined $err_type && length $err_type); -+ $out .= "pcie_info=$pcie_info, " if (defined $pcie_info && length $pcie_info); -+ $out .= "err_severity=$err_severity, " if (defined $err_severity && length $err_severity); -+ $out .= "Error Registers: $regs" if (defined $regs && length $regs); - $out .= "\n\n"; - $found_module = 1; - } --- -2.33.1 - diff --git a/1027-anolis-Add-dynamic-switch-of-ras-events-support.patch 
b/1027-anolis-Add-dynamic-switch-of-ras-events-support.patch deleted file mode 100644 index 4d17a1955a50f8d24c5b686c554fa1fea15429d5..0000000000000000000000000000000000000000 --- a/1027-anolis-Add-dynamic-switch-of-ras-events-support.patch +++ /dev/null @@ -1,162 +0,0 @@ -From 80e534e597163ef2fd4fc3bff3d441420914e0d2 Mon Sep 17 00:00:00 2001 -From: caixiaomeng 00662745 -Date: Wed, 29 Nov 2023 14:31:46 +0800 -Subject: [PATCH 27/85] anolis: Add dynamic switch of ras events support. - -Rasdaemon does not support a way to disable some events by config. -If user want to disable specified event(eg:block_rq_complete), he -should recompile rasdaemon, which is not so convenient. - -This patch add dynamic switch of ras event support.You can add -events you want to disabled in /etc/sysconfig/rasdaemon.For example, -`DISABLE="ras:mc_event,block:block_rq_complete"`.Then restart -rasdaemon, these two events will be disabled without recompilation. - -[mchehab: make is_disabled_event() static] -Signed-off-by: Mauro Carvalho Chehab -[Ruidong: delete cxl code] ---- - ras-events.c | 35 ++++++++++++++++++++++++++++------- - rasdaemon.c | 3 +++ - 2 files changed, 31 insertions(+), 7 deletions(-) - -diff --git a/ras-events.c b/ras-events.c -index 9ad34f8..31a4e0b 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -57,6 +57,8 @@ - #define ENDIAN KBUFFER_ENDIAN_BIG - #endif - -+extern char* choices_disable; -+ - static int get_debugfs_dir(char *tracing_dir, size_t len) - { - FILE *fp; -@@ -147,6 +149,18 @@ static int get_tracing_dir(struct ras_events *ras) - return 0; - } - -+static int is_disabled_event(char *group, char *event) { -+ char ras_event_name[MAX_PATH + 1]; -+ -+ snprintf(ras_event_name, sizeof(ras_event_name), "%s:%s", -+ group, event); -+ -+ if (choices_disable != NULL && strlen(choices_disable) != 0 && strstr(choices_disable, ras_event_name)) { -+ return 1; -+ } -+ return 0; -+} -+ - /* - * Tracing enable/disable code - */ -@@ -155,6 +169,7 @@ static int 
__toggle_ras_mc_event(struct ras_events *ras, - { - int fd, rc; - char fname[MAX_PATH + 1]; -+ enable = is_disabled_event(group, event) ? 0 : 1; - - snprintf(fname, sizeof(fname), "%s%s:%s\n", - enable ? "" : "!", -@@ -775,6 +790,12 @@ static int add_event_handler(struct ras_events *ras, struct pevent *pevent, - - ras->filters[id] = filter; - -+ if (is_disabled_event(group, event)) { -+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n", -+ group, event); -+ return -EINVAL; -+ } -+ - /* Enable RAS events */ - rc = __toggle_ras_mc_event(ras, group, event, 1); - free(page); -@@ -842,7 +863,7 @@ int handle_ras_events(int record_events) - ras_mc_event_handler, NULL, MC_EVENT); - if (!rc) - num_events++; -- else -+ else if (rc != -EINVAL) - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", - "ras", "mc_event"); - -@@ -851,7 +872,7 @@ int handle_ras_events(int record_events) - ras_aer_event_handler, NULL, AER_EVENT); - if (!rc) - num_events++; -- else -+ else if (rc != -EINVAL) - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", - "ras", "aer_event"); - #endif -@@ -861,7 +882,7 @@ int handle_ras_events(int record_events) - ras_non_standard_event_handler, NULL, NON_STANDARD_EVENT); - if (!rc) - num_events++; -- else -+ else if (rc != -EINVAL) - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", - "ras", "non_standard_event"); - #endif -@@ -871,7 +892,7 @@ int handle_ras_events(int record_events) - ras_arm_event_handler, NULL, ARM_EVENT); - if (!rc) - num_events++; -- else -+ else if (rc != -EINVAL) - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", - "ras", "arm_event"); - #endif -@@ -905,7 +926,7 @@ int handle_ras_events(int record_events) - /* tell kernel we are listening, so don't printk to console */ - (void)open("/sys/kernel/debug/ras/daemon_active", 0); - num_events++; -- } else -+ } else if (rc != -EINVAL) - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", - "ras", "extlog_mem_event"); - #endif -@@ -922,7 +943,7 @@ int handle_ras_events(int 
record_events) - ras_devlink_event_handler, filter_str, DEVLINK_EVENT); - if (!rc) - num_events++; -- else -+ else if (rc != -EINVAL) - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", - "devlink", "devlink_health_report"); - #endif -@@ -946,7 +967,7 @@ int handle_ras_events(int record_events) - ras_memory_failure_event_handler, NULL, MF_EVENT); - if (!rc) - num_events++; -- else -+ else if (rc != -EINVAL) - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", - "ras", "memory_failure_event"); - #endif -diff --git a/rasdaemon.c b/rasdaemon.c -index e9a3a4d..0db51c9 100644 ---- a/rasdaemon.c -+++ b/rasdaemon.c -@@ -33,6 +33,8 @@ - #define TOOL_NAME "rasdaemon" - #define TOOL_DESCRIPTION "RAS daemon to log the RAS events." - #define ARGS_DOC "" -+#define DISABLE "DISABLE" -+char *choices_disable = NULL; - - const char *argp_program_version = TOOL_NAME " " VERSION; - const char *argp_program_bug_address = "Mauro Carvalho Chehab "; -@@ -127,6 +129,7 @@ int main(int argc, char *argv[]) - { - struct arguments args; - int idx = -1; -+ choices_disable = getenv(DISABLE); - - #ifdef HAVE_MCE - const struct argp_option offline_options[] = { --- -2.33.1 - diff --git a/1027-anolis-add-nvgpu-driver.patch b/1027-anolis-add-nvgpu-driver.patch new file mode 100644 index 0000000000000000000000000000000000000000..ca4bc5c0dfa43357b0e20589b39f8630e69378bd --- /dev/null +++ b/1027-anolis-add-nvgpu-driver.patch @@ -0,0 +1,590 @@ +From ed059449efe2ce84e1c7cffdc5502430052c043e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Wed, 23 Apr 2025 11:17:32 +0800 +Subject: [PATCH 1/3] anolis: add nvgpu driver + +Signed-off-by: Ruidong Tian +--- + Makefile.am | 22 ++- + configure.ac | 5 + + ras-nvgpu-driver.c | 444 +++++++++++++++++++++++++++++++++++++++++++++ + ras-nvgpu-nvml.c | 2 - + ras-nvgpu.c | 10 +- + ras-nvgpu.h | 2 + + 6 files changed, 476 insertions(+), 9 deletions(-) + create mode 100644 ras-nvgpu-driver.c + +diff --git a/Makefile.am b/Makefile.am +index 203b576..c400473 100644 +--- 
a/Makefile.am ++++ b/Makefile.am +@@ -27,7 +27,9 @@ EXTRA_DIST = \ + contrib/rasdaemon.init \ + contrib/nvml.py \ + contrib/nvml.h \ +- contrib/*_trigger ++ contrib/*_trigger \ ++ libnvgpudriver_x86_64.a \ ++ libnvgpudriver_aarch64.a + + CLEANFILES= \ + ras-nvgpu-nvml.h \ +@@ -148,14 +150,16 @@ if WITH_ERST + endif + + if WITH_NVGPU +- BUILT_SOURCES = ras-nvgpu-nvml.h ++ BUILT_SOURCES = ras-nvgpu-nvml.h libnvgpudriver.a + ras-nvgpu-nvml.h: contrib/nvml.py + python3 $< > $@ ++libnvgpudriver.a: nvgpu_driver ++ cp libnvgpudriver_$(shell uname -m).a $@ + rasdaemon_SOURCES += ras-nvgpu.c ras-nvgpu-nvml.c + endif + +-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS) +-rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) ++rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) $(LIBPCI_LIBS) -ldl $(ZLIBS) $(NVGPU_LIBS) ++rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) $(LIBPCI_CFLAGS) $(NVGPU_CFLAGS) + + include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ +@@ -210,3 +214,13 @@ install-data-local: + install -D -p -m 0655 @abs_srcdir@/misc/rasdaemon.rsyslog-ext "$(DESTDIR)@sysconfdir@/rsyslog.d/rasdaemon.rsyslog-ext"; \ + fi + $(install_sh) @abs_srcdir@/contrib/*_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/" ++ ++nvgpu_driver: ++ if [ ! 
-d "open-gpu-kernel-modules" ]; then git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git -b 570; fi ++ gcc -o ras-nvgpu-driver.o -I./open-gpu-kernel-modules/kernel-open/common/inc \ ++ -I./open-gpu-kernel-modules/kernel-open/nvidia-uvm \ ++ -I./open-gpu-kernel-modules/src/common/sdk/nvidia/inc \ ++ -I./open-gpu-kernel-modules/src/nvidia/arch/nvalloc/unix/include \ ++ $(LIBTRACEEVENT_LIBS) \ ++ -O2 -fPIE -c ras-nvgpu-driver.c ++ ar rcs libnvgpudriver_$(shell uname -m).a ras-nvgpu-driver.o +diff --git a/configure.ac b/configure.ac +index 68fcb75..46ba36e 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -303,10 +303,15 @@ AC_ARG_ENABLE([nvgpu], + AS_IF([test "x$enable_nvgpu" = "xyes" || test "x$enable_all" == "xyes"], [ + AC_DEFINE(HAVE_NVGPU,1,"have NVGPU events collect") + AC_SUBST([WITH_NVGPU]) ++ NVGPU_LIBS="-lnvgpudriver" ++ NVGPU_CFLAGS="-L." + ]) + AM_CONDITIONAL([WITH_NVGPU], [test x$enable_nvgpu = xyes || test x$enable_all == xyes]) + AM_COND_IF([WITH_NVGPU], [USE_NVGPU="yes"], [USE_NVGPU="no"]) + ++AC_SUBST([NVGPU_LIBS]) ++AC_SUBST([NVGPU_CFLAGS]) ++ + AC_ARG_ENABLE([kmsg_monitor], + AS_HELP_STRING([--enable-kmsg-monitor], [enable kmsg monitor (currently experimental)])) + +diff --git a/ras-nvgpu-driver.c b/ras-nvgpu-driver.c +new file mode 100644 +index 0000000..a72a7c5 +--- /dev/null ++++ b/ras-nvgpu-driver.c +@@ -0,0 +1,444 @@ ++ ++#include "nvtypes.h" ++#include ++#include // NV01_DEVICE_0 ++#include // NV20_SUBDEVICE_0 ++ ++#include ++#include // VOLTA_CHANNELChannelGPFifoA ++#include // NV20_SUBDEVICE_0 ++#include // NV20_SUBDEVICE_0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "ras-logger.h" ++#include "ras-nvgpu.h" ++#include ++#define NV_PLATFORM_MAX_IOCTL_SIZE 16384 ++#include "nv.h" ++#include "nvos.h" ++#include "nv_escape.h" ++ ++#include "nvstatus.h" ++ ++#define NV_PRINTF_STRING_SECTION ++#undef NV_STATUS_CODE ++#undef 
SDK_NVSTATUSCODES_H ++#define NV_STATUS_CODE( name, code, string ) static NV_PRINTF_STRING_SECTION \ ++ const char rm_pvt_##name##_str[] = string " [" #name "]"; ++#include "nvstatuscodes.h" ++ ++#undef NV_STATUS_CODE ++#undef SDK_NVSTATUSCODES_H ++#define NV_STATUS_CODE( name, code, string ) [code] = { name, rm_pvt_##name##_str }, ++static struct NvStatusCodeString ++{ ++ NV_STATUS statusCode; ++ const char *statusString; ++} g_StatusCodeList[] = { ++ #include "nvstatuscodes.h" ++}; ++#undef NV_STATUS_CODE ++ ++#include ++ ++#define assert_with_message(condition, message, ...) \ ++ do { \ ++ if (!(condition)) { \ ++ log(ALL, LOG_ERR, "%s Assertion failed: %s: " message "\n", \ ++ __func__, #condition, ##__VA_ARGS__); \ ++ ret = 1; \ ++ } \ ++ } while (0) ++ ++#define nv_assert_ioctl(fd, cmd, p) \ ++ do { \ ++ int r = ioctl(fd, __NV_IOWR(cmd, p), &p); \ ++ assert_with_message(r == 0, "%s", strerror(r)); \ ++ assert_with_message(p.status == 0, "%s", g_StatusCodeList[p.status].statusString); \ ++ } while (0) ++ ++#define error_exit(a, free) \ ++ do { \ ++ a; \ ++ if (ret) goto free; \ ++ } while (0) ++ ++static int ret; ++static void alloc_root(int fd_ctl, NvHandle *root) { ++ NVOS64_PARAMETERS p = { ++ .hClass = NV01_ROOT_CLIENT ++ }; ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p); ++ *root = p.hObjectNew; ++} ++ ++static void free_nvgpu(int fd_ctl, NvHandle root, NvHandle obj, NvHandle old_obj) { ++ NVOS00_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = obj, .hObjectOld = old_obj ++ }; ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_FREE, p); ++} ++ ++static void alloc_device(int fd_ctl, NvHandle root, NV0080_ALLOC_PARAMETERS *dev, NvHandle *device) { ++ NVOS64_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = root, .hClass = NV01_DEVICE_0, .pAllocParms = dev, .paramsSize = sizeof(*dev) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p); ++ *device = p.hObjectNew; ++} ++ ++static void alloc_subdevice(int fd_ctl, NvHandle root, NvHandle parent, 
NV2080_ALLOC_PARAMETERS *subdev, NvHandle *subdevice) { ++ NVOS64_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = parent, .hClass = NV20_SUBDEVICE_0, .pAllocParms = subdev, .paramsSize = sizeof(*subdev) ++ }; ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_ALLOC, p); ++ ++ *subdevice = p.hObjectNew; ++} ++ ++static void wait_open(int fd_dev) ++{ ++ nv_ioctl_wait_open_complete_t p = { 0 }; ++ ++ int r = ioctl(fd_dev, __NV_IOWR(NV_ESC_WAIT_OPEN_COMPLETE, p), &p); /* named 'r' so the macro below sets the file-scope 'ret' that error_exit() tests */ ++ assert_with_message(r == 0, "%s", strerror(r)); ++} ++ ++static void get_pci(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS *pci) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_PCI_INFO, .params = pci, .paramsSize = sizeof(*pci) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void attach_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_ATTACH_IDS_PARAMS *attach) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_ATTACH_IDS, .params = attach, .paramsSize = sizeof(*attach) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void deattach_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_DETACH_IDS_PARAMS *attach) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_DETACH_IDS, .params = attach, .paramsSize = sizeof(*attach) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void get_id(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_PROBED_IDS_PARAMS *probe) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_PROBED_IDS, .params = probe, .paramsSize = sizeof(*probe) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void get_id_info(int fd_ctl, NvHandle root, NV0000_CTRL_GPU_GET_ID_INFO_PARAMS *info) { ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = root, .cmd = NV0000_CTRL_CMD_GPU_GET_ID_INFO, .params = info, 
.paramsSize = sizeof(*info) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void register_fd(int fd_dev, int fd_ctl) { ++ nv_ioctl_register_fd_t p = { .ctl_fd = fd_ctl }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_REGISTER_FD, p), &p); ++ assert(ret == 0); ++} ++ ++static void alloc_event(int fd_dev, NvHandle root, NvHandle device, int fd_uvm) { ++ nv_ioctl_alloc_os_event_t p = { .hClient = root, .hDevice = device, .fd = fd_uvm }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_ALLOC_OS_EVENT, p), &p); ++ assert(ret == 0); ++} ++ ++static void free_event(int fd_dev, NvHandle root, NvHandle device) { ++ nv_ioctl_alloc_os_event_t p = { .hClient = root, .hDevice = device, .fd = fd_dev }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_ALLOC_OS_EVENT, p), &p); ++ assert(ret == 0); ++} ++ ++static void event_os_event(int fd_dev, NvHandle root, NvHandle subdevice, int index, NvHandle *event, int fd_uvm) { ++ NV0005_ALLOC_PARAMETERS pp = { .hParentClient = root, .data = (NvP64)fd_uvm, .notifyIndex = index, .hClass = NV01_EVENT_OS_EVENT }; ++ ++ NVOS64_PARAMETERS p = { ++ .hRoot = root, .hObjectParent = subdevice, .hClass = NV01_EVENT_OS_EVENT, .pAllocParms = &pp, .paramsSize = sizeof(pp) ++ }; ++ ++ nv_assert_ioctl(fd_dev, NV_ESC_RM_ALLOC, p); ++ *event = p.hObjectNew; ++} ++ ++static void set_event(int fd_ctl, NvHandle root, NvHandle subdevice, int index, int type) ++{ ++ NV2080_CTRL_EVENT_SET_NOTIFICATION_PARAMS set = { .event = index, .action = type, .bNotifyState = 0 }; ++ ++ NVOS54_PARAMETERS p = { ++ .hClient = root, .hObject = subdevice, .cmd = NV2080_CTRL_CMD_EVENT_SET_NOTIFICATION, .params = &set, .paramsSize = sizeof(set) ++ }; ++ ++ nv_assert_ioctl(fd_ctl, NV_ESC_RM_CONTROL, p); ++} ++ ++static void get_event(NvUnixEvent *event, int fd_dev, int fd_uvm, NvHandle root, NvHandle subdevice, int i) ++{ ++ NVOS41_PARAMETERS p = { .pEvent = event, .MoreEvents = 0 }; ++ int ret = ioctl(fd_dev, __NV_IOWR(NV_ESC_RM_GET_EVENT_DATA, p), &p); ++ 
assert(ret == 0); ++} ++ ++struct ras_nvgpu_event { ++ NvHandle event; ++ NvV32 index; ++}; ++ ++#define NVGPU_EVENT_NUM 10 ++struct ras_nvgpu_driver { ++ NvHandle device; ++ NvHandle subdevice; ++ NvU32 gpu_id; ++ int fd; ++ NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS pci; ++ struct ras_nvgpu_event events[NVGPU_EVENT_NUM]; ++}; ++ ++static int event_index[NVGPU_EVENT_NUM] = { ++ NV2080_NOTIFIERS_RC_ERROR, ++ NV2080_NOTIFIERS_ECC_DBE, ++ NV2080_NOTIFIERS_NVLINK_ERROR_FATAL, ++ NV2080_NOTIFIERS_NVLINK_ERROR_RECOVERY_REQUIRED, ++ NV2080_NOTIFIERS_POISON_ERROR_NON_FATAL, ++ NV2080_NOTIFIERS_POISON_ERROR_FATAL, ++ NV2080_NOTIFIERS_NVLINK_INFO_LINK_DOWN, ++ NV2080_NOTIFIERS_ECC_SBE_STORM, ++ NV2080_NOTIFIERS_NVLINK_UNCONTAINED_ERROR, ++ NV2080_NOTIFIERS_GPU_UNAVAILABLE ++}; ++ ++static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent *event) ++{ ++ struct trace_seq s; ++ time_t now; ++ struct tm *tm; ++ char timestamp[64]; ++ ++ time(&now); ++ tm = localtime(&now); ++ ++ if (tm) ++ strftime(timestamp, sizeof(timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ trace_seq_init(&s); ++ if (event->NotifyIndex == NV2080_NOTIFIERS_RC_ERROR) { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, XID_EVENT_NAME); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); ++ trace_seq_printf(&s, "xid: %d ", event->info32); ++ trace_seq_printf(&s, "data1: %d ", event->info16); ++ } else { ++ trace_seq_printf(&s, "%16s-%-10d [%03d] %s %6.6f %25s: ", ++ "<...>", 0, -1, "....", 0.0f, NVGPU_EVENT_NAME); ++ trace_seq_printf(&s, "%s %s ", loglevel_str[LOGLEVEL_CRIT], timestamp); ++ trace_seq_printf(&s, "event_type: %d ", event->NotifyIndex); ++ trace_seq_printf(&s, "data: %d ", event->info32); ++ trace_seq_printf(&s, "data1: %d ", event->info16); ++ ++ } ++ ++ trace_seq_printf(&s, "pci_port: %08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot); ++ ++ trace_seq_terminate(&s); ++ 
trace_seq_do_printf(&s); ++ printf("\n"); ++ fflush(stdout); ++ trace_seq_destroy(&s); ++ ++ return 0; ++} ++ ++int ras_nvgpu_driver_handle(void) { ++ int fd_ctl = 0, fd_uvm = 0, i, gpu_count = 0; ++ NvHandle root = 0; ++ struct pollfd *pfd; ++ ++ fd_uvm = open("/dev/nvidia-uvm", O_RDWR | O_CLOEXEC); ++ if (fd_uvm < 0) { /* test the descriptor just opened; fd_ctl is still 0 at this point */ ++ perror("open"); ++ return 1; ++ } ++ ++ fd_ctl = open("/dev/nvidiactl", O_RDWR | O_CLOEXEC); ++ if (fd_ctl < 0) { ++ perror("open"); ++ ret = 1; ++ goto close_uvm; ++ } ++ ++ error_exit(alloc_root(fd_ctl, &root), close); ++ ++ NV0000_CTRL_GPU_GET_PROBED_IDS_PARAMS id = {0}; ++ NV0000_CTRL_GPU_ATTACH_IDS_PARAMS attach = {0}; ++ NV0000_CTRL_GPU_DETACH_IDS_PARAMS detach = {0}; ++ error_exit(get_id(fd_ctl, root, &id), free_root); ++ ++ for (i = 0; i < NV0000_CTRL_GPU_MAX_PROBED_GPUS; i++) { ++ if (id.gpuIds[i] == NV0000_CTRL_GPU_INVALID_ID) ++ break; ++ ++ attach.gpuIds[i] = id.gpuIds[i]; ++ detach.gpuIds[i] = id.gpuIds[i]; ++ } ++ gpu_count = i; ++ attach.gpuIds[i] = NV0000_CTRL_GPU_INVALID_ID; ++ ++ error_exit(attach_id(fd_ctl, root, &attach), free_root); ++ ++ struct ras_nvgpu_driver *nvgpus = calloc(gpu_count, sizeof(struct ras_nvgpu_driver)); ++ if (!nvgpus) { ++ log(ALL, LOG_ERR, "nvgpu alloc error\n"); ++ ret = 1; ++ goto detach; ++ } ++ ++ for (i = 0; i < gpu_count; i++) { ++ char path[32]; ++ struct ras_nvgpu_driver *nvgpu = &nvgpus[i]; ++ NV0000_CTRL_GPU_GET_PCI_INFO_PARAMS pci = {0}; ++ NV0000_CTRL_GPU_GET_ID_INFO_PARAMS info = {0}; ++ NV0080_ALLOC_PARAMETERS dev = { 0 }; ++ NV2080_ALLOC_PARAMETERS subdev = { 0 }; ++ NvU32 gpu_id = id.gpuIds[i]; ++ int fd; ++ ++ nvgpu->gpu_id = gpu_id; ++ snprintf(path, 32, "/dev/nvidia%d", i); ++ nvgpu->fd = open(path, O_RDWR | O_CLOEXEC); ++ if (nvgpu->fd < 0) { ++ log(ALL, LOG_ERR, "nvgpu open error\n"); ++ goto free_nvgpu; ++ } ++ fd = nvgpu->fd; ++ ++ error_exit(wait_open(fd), free_nvgpu); ++ ++ pci.gpuId = gpu_id; ++ error_exit(get_pci(fd_ctl, root, &pci), free_nvgpu); ++ nvgpu->pci = pci; ++ ++ 
info.gpuId = id.gpuIds[i]; ++ error_exit(get_id_info(fd_ctl, root, &info), free_nvgpu); ++ ++ error_exit(register_fd(fd, fd_ctl), free_nvgpu); ++ ++ dev.deviceId = info.deviceInstance; ++ error_exit(alloc_device(fd_ctl, root, &dev, &nvgpu->device), free_nvgpu); ++ ++ subdev.subDeviceId = info.subDeviceInstance; ++ error_exit(alloc_subdevice(fd_ctl, root, nvgpu->device, &subdev, &nvgpu->subdevice), free_nvgpu); ++ ++ error_exit(alloc_event(fd, root, nvgpu->device, fd_uvm), free_nvgpu); ++ ++ for (int j = 0; j < NVGPU_EVENT_NUM; j++) { ++ struct ras_nvgpu_event *event = &nvgpu->events[j]; ++ event->index = event_index[j]; ++ ++ event_os_event(fd, root, nvgpu->subdevice, event->index, &event->event, fd_uvm); ++ if (ret) { ++ log(ALL, LOG_ERR, "nvgpu event %d register error\n", event->index); ++ ret = 0; ++ continue; ++ } ++ set_event(fd_ctl, root, nvgpu->subdevice, event->index, NV2080_CTRL_EVENT_SET_NOTIFICATION_ACTION_REPEAT); ++ if (ret) { ++ log(ALL, LOG_ERR, "nvgpu event %d set error\n", event->index); ++ free_nvgpu(fd_ctl, root, nvgpu->subdevice, event->event); ++ ret = 0; ++ continue; ++ } ++ } ++ log(ALL, LOG_INFO, "GPU %d: %04x:%02x:%02x.0 found, deviceid %d subdeviceid %d\n", ++ nvgpu->gpu_id, nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot, info.deviceInstance, info.subDeviceInstance); ++ } ++ ++ pfd = malloc(sizeof(struct pollfd) * gpu_count); ++ if (!pfd) { ++ log(ALL, LOG_ERR, "nvgpu alloc error\n"); ++ ret = 1; ++ goto free_nvgpu; ++ } ++ ++ for (i = 0; i < gpu_count; i++) { ++ pfd[i].fd = nvgpus[i].fd; ++ pfd[i].events = POLLIN | POLLPRI; ++ } ++ ++ while (1) { ++ if (poll(pfd, gpu_count, -1) < 0) { ++ log(ALL, LOG_ERR, "nvgpu poll error\n"); ++ goto free_pfd; ++ } ++ ++ for (i = 0; i < gpu_count; i++) { ++ if (pfd[i].revents & POLLIN) { ++ NvUnixEvent event; ++ ++ get_event(&event, nvgpus[i].fd, fd_uvm, root, nvgpus[i].subdevice, 25); ++ ++ report_ras_nvgpu_driver(&nvgpus[i], &event); ++ } ++ } ++ } ++ ++free_pfd: ++ free(pfd); ++free_nvgpu: ++ for 
(i = 0; i < gpu_count; i++) { ++ struct ras_nvgpu_driver *nvgpu = &nvgpus[i]; ++ ++ for (int j = 0; j < NVGPU_EVENT_NUM; j++) { ++ struct ras_nvgpu_event *event = &nvgpu->events[j]; ++ ++ if (event->event) { ++ set_event(fd_ctl, root, nvgpu->subdevice, event->index, NV2080_CTRL_EVENT_SET_NOTIFICATION_ACTION_DISABLE); ++ free_nvgpu(fd_ctl, root, nvgpu->subdevice, event->event); /* parent is this GPU's subdevice, not nvgpus[0]'s */ ++ } ++ } ++ free_event(nvgpu->fd, root, nvgpu->device); ++ if (nvgpu->subdevice) ++ free_nvgpu(fd_ctl, root, nvgpu->device, nvgpu->subdevice); ++ if (nvgpu->device) ++ free_nvgpu(fd_ctl, root, nvgpu->device, 0); ++ if (nvgpu->device) ++ free_nvgpu(fd_ctl, root, root, nvgpu->device); ++ if (nvgpu->fd) ++ close(nvgpu->fd); ++ } ++detach: ++ deattach_id(fd_ctl, root, &detach); ++free_root: ++ free_nvgpu(fd_ctl, root, root, root); ++close: ++ close(fd_ctl); ++close_uvm: ++ close(fd_uvm); ++ ++ return ret; ++} +\ No newline at end of file +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index 2758d14..541ff69 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -14,8 +14,6 @@ + #include "trace-seq.h" + #include "types.h" + +-#define XID_EVENT_NAME "xid" +- + const char *lib_name[] = { + "/lib64/libnvidia-ml.so", + "/lib64/libnvidia-ml.so.1", +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +index 5c63279..4d39de2 100644 +--- a/ras-nvgpu.c ++++ b/ras-nvgpu.c +@@ -43,12 +43,16 @@ void *ras_nvgpu_handle(void *arg) + + while (retry--) { + if (ras_nvgpu_nvml_handle()) { +- log(ALL, LOG_ERR, "NVGPU handle retry %d\n", retry); +- sleep(10); ++ log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry); ++ sleep(1); + } + } + +- log(ALL, LOG_ERR, "NVGPU handle fail, exit from nvgpu thread\n"); ++ log(ALL, LOG_ERR, "NVGPU nvml handle fail, try for nvgpu driver call\n"); ++ ++ ras_nvgpu_driver_handle(); ++ ++ log(ALL, LOG_ERR, "NVGPU driver handle fail, exit nvgpu thread\n"); + + return NULL; + } +diff --git a/ras-nvgpu.h b/ras-nvgpu.h +index 32827ad..bade7e4 100644 +--- a/ras-nvgpu.h ++++ b/ras-nvgpu.h +@@ 
-8,7 +8,9 @@ + #define __RAS_NVGPU_H + + #define NVGPU_EVENT_NAME "nvgpu" ++#define XID_EVENT_NAME "xid" + + void *ras_nvgpu_handle(void *arg); + int ras_nvgpu_nvml_handle(void); ++int ras_nvgpu_driver_handle(void); + #endif +-- +2.43.5 + diff --git a/1028-anolis-add-trigger-for-nvgpu-event.patch b/1028-anolis-add-trigger-for-nvgpu-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..8c17b64c66c7f3967258bd89889656f775b6a98f --- /dev/null +++ b/1028-anolis-add-trigger-for-nvgpu-event.patch @@ -0,0 +1,241 @@ +From 67fcdb9008b17555b0ea0d4c791f3ac772ee682c Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 25 Apr 2025 10:20:16 +0800 +Subject: [PATCH 2/3] anolis: add trigger for nvgpu event + +Signed-off-by: Ruidong Tian +--- + contrib/nvgpu_trigger | 25 +++++++++++++++++++++++++ + misc/rasdaemon.env | 3 +++ + ras-nvgpu-driver.c | 7 ++++++- + ras-nvgpu-nvml.c | 8 +++++++- + ras-nvgpu.c | 3 +++ + trigger.c | 35 +++++++++++++++++++++++++++++++++++ + trigger.h | 1 + + 7 files changed, 80 insertions(+), 2 deletions(-) + create mode 100755 contrib/nvgpu_trigger + +diff --git a/contrib/nvgpu_trigger b/contrib/nvgpu_trigger +new file mode 100755 +index 0000000..48955af +--- /dev/null ++++ b/contrib/nvgpu_trigger +@@ -0,0 +1,25 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when a ++# memory_failure_event is occured, environment variables include all ++# information reported by tracepoint. ++ ++# environment: ++# BDF ++# EVENT_TYPE ++# DATA1 ++# DATA2 ++# ++ ++[ -x ./nvgpu_trigger.local ] && . ./nvgpu_trigger.local ++ ++if [ -d nvgpu_trigger.extern ] ++then ++ ls nvgpu_trigger.extern | ++ while read item ++ do ++ [ -x ./nvgpu_trigger.extern/$item ] && . 
./nvgpu_trigger.extern/$item ++ done ++fi ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 198b050..b08afa6 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -119,6 +119,9 @@ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 + KMSG_TRIGGER= + KMSG_TRIGGER_TIMEOUT=0 + ++NVGPU_TRIGGER= ++NVGPU_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second. +diff --git a/ras-nvgpu-driver.c b/ras-nvgpu-driver.c +index a72a7c5..9093292 100644 +--- a/ras-nvgpu-driver.c ++++ b/ras-nvgpu-driver.c +@@ -24,6 +24,7 @@ + + #include "ras-logger.h" + #include "ras-nvgpu.h" ++#include "trigger.h" + #include + #define NV_PLATFORM_MAX_IOCTL_SIZE 16384 + #include "nv.h" +@@ -238,6 +239,7 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent * + time_t now; + struct tm *tm; + char timestamp[64]; ++ char tmpbuf[64]; + + time(&now); + tm = localtime(&now); +@@ -263,7 +265,8 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent * + + } + +- trace_seq_printf(&s, "pci_port: %08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot); ++ snprintf(tmpbuf, sizeof(tmpbuf), "%08X:%02X:%02X.0 ", nvgpu->pci.domain, nvgpu->pci.bus, nvgpu->pci.slot); ++ trace_seq_printf(&s, "pci_port: %s ", tmpbuf); + + trace_seq_terminate(&s); + trace_seq_do_printf(&s); +@@ -271,6 +274,8 @@ static int report_ras_nvgpu_driver(struct ras_nvgpu_driver *nvgpu, NvUnixEvent * + fflush(stdout); + trace_seq_destroy(&s); + ++ run_nvgpu_trigger(tmpbuf, event->NotifyIndex, event->info32, event->info16); ++ + return 0; + } + +diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index 541ff69..f2421a1 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -4,6 +4,7 @@ + * Copyright (C) 2025 Alibaba Inc + */ + ++#include + #include + #include + #include +@@ -13,6 +14,7 @@ + #include "ras-nvgpu.h" + #include "trace-seq.h" + #include "types.h" ++#include "trigger.h" + + const char *lib_name[] = { + 
"/lib64/libnvidia-ml.so", +@@ -42,6 +44,7 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + time_t now; + struct tm *tm; + char timestamp[64]; ++ char tmpbuf[64]; + + time(&now); + tm = localtime(&now); +@@ -66,7 +69,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + trace_seq_printf(&s, "data: %lld ", data->eventData); + } + +- trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ snprintf(tmpbuf, sizeof(tmpbuf), NVML_DEVICE_PCI_BUS_ID_FMT, NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ trace_seq_printf(&s, "pci_port: %s ", tmpbuf); + trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId); + trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId); + +@@ -76,6 +80,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + fflush(stdout); + trace_seq_destroy(&s); + ++ run_nvgpu_trigger(tmpbuf, data->eventType, data->eventData, 0); ++ + return 0; + } + +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +index 4d39de2..37a8833 100644 +--- a/ras-nvgpu.c ++++ b/ras-nvgpu.c +@@ -15,6 +15,7 @@ + #include "ras-events.h" + #include "ras-logger.h" + #include "ras-nvgpu.h" ++#include "trigger.h" + void *ras_nvgpu_handle(void *arg) + { + (void)arg; +@@ -41,6 +42,8 @@ void *ras_nvgpu_handle(void *arg) + return NULL; + } + ++ setup_event_trigger("nvgpu_event"); ++ + while (retry--) { + if (ras_nvgpu_nvml_handle()) { + log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry); +diff --git a/trigger.c b/trigger.c +index d410137..e113077 100644 +--- a/trigger.c ++++ b/trigger.c +@@ -101,6 +101,8 @@ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFF + + struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"}; + ++struct event_trigger nvgpu_trigger = {"nvgpu_event", "NVGPU_TRIGGER"}; ++ + static struct event_trigger *event_triggers[] = { + &mc_ue_trigger, + #ifdef HAVE_MCE +@@ -122,6 +124,9 @@ static struct 
event_trigger *event_triggers[] = { + #ifdef HAVE_KMSG_MONITOR + &kmsg_trigger, + #endif ++#ifdef HAVE_NVGPU ++ &nvgpu_trigger, ++#endif + }; + + void setup_event_trigger(const char *event) +@@ -476,3 +481,33 @@ free: + free(env[i]); + } + ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0; ++ struct event_trigger *trigger = &nvgpu_trigger; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BDF=%s", pci_bdf) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "EVENT_TYPE=%d", event_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA1=%d", data1) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA2=%d", data2) < 0) ++ goto free; ++ ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (int i = 0; i < ei; i++) ++ free(env[i]); ++} ++ +diff --git a/trigger.h b/trigger.h +index b5a6c2c..2ea2b09 100644 +--- a/trigger.h ++++ b/trigger.h +@@ -29,6 +29,7 @@ void run_mf_event_trigger(struct ras_mf_event *e); + void run_aer_event_trigger(struct ras_aer_event *e); + void run_page_offline_trigger(unsigned long long addr, int otype, int type); + void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg); ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2); + + + #endif +-- +2.43.5 + diff --git a/1028-rasdaemon-Add-support-for-creating-vendor-tables-at-.patch b/1028-rasdaemon-Add-support-for-creating-vendor-tables-at-.patch deleted file mode 100644 index 26757a16e1e6d76f0b3e23aed74f035ea298850e..0000000000000000000000000000000000000000 --- a/1028-rasdaemon-Add-support-for-creating-vendor-tables-at-.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 83149843435ffa5f22bc12ca67cd17b1b94fd3c0 Mon Sep 17 00:00:00 2001 -From: Hunter He -Date: Wed, 6 Dec 2023 14:52:03 +0800 
-Subject: [PATCH 28/85] rasdaemon:Add support for creating vendor tables at - startup. - -When rasdaemon is running without non-standard error, those -tables are not created in the database file. Then ras-mc-ctl -script breaks trying to query data from non-existent tables. - -Add support for creating vendor tables at startup. - -Signed-off-by: Hunter He ---- - non-standard-yitian.c | 27 +++++++++++++++++---------- - 1 file changed, 17 insertions(+), 10 deletions(-) - -diff --git a/non-standard-yitian.c b/non-standard-yitian.c -index 99cea47..4c30514 100644 ---- a/non-standard-yitian.c -+++ b/non-standard-yitian.c -@@ -164,16 +164,6 @@ void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, - const char *subtype_str = oem_subtype_name(yitian_payload_error_type, - header->type, header->subtype); - --#ifdef HAVE_SQLITE3 -- if (ras->record_events && !ev_decoder->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -- &yitian_ddr_payload_section_tab) != SQLITE_OK) { -- trace_seq_printf(s, "create sql fail\n"); -- return; -- } -- } --#endif -- - now = time(NULL); - tm = localtime(&now); - if (tm) -@@ -217,6 +207,22 @@ void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, - - } - -+static int add_yitian_common_table(struct ras_events *ras, -+ struct ras_ns_ev_decoder *ev_decoder) -+{ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -+ &yitian_ddr_payload_section_tab) != SQLITE_OK) { -+ log(TERM, LOG_WARNING, -+ "Failed to create sql yitian_ddr_payload_section_tab\n"); -+ return -1; -+ } -+ } -+#endif -+ return 0; -+} -+ - /* error data decoding functions */ - static int decode_yitian710_ns_error(struct ras_events *ras, - struct ras_ns_ev_decoder *ev_decoder, -@@ -239,6 +245,7 @@ static int decode_yitian710_ns_error(struct ras_events *ras, - struct ras_ns_ev_decoder yitian_ns_oem_decoder[] = { - { 
- .sec_type = "a6980811-16ea-4e4d-b936-fb00a23ff29c", -+ .add_table = add_yitian_common_table, - .decode = decode_yitian710_ns_error, - }, - }; --- -2.33.1 - diff --git a/1029-Fix-potential-overflow-with-some-arrays-at-page-isol.patch b/1029-Fix-potential-overflow-with-some-arrays-at-page-isol.patch deleted file mode 100644 index 215c36fbcc113794d6d91a284f79018ddacc5f5c..0000000000000000000000000000000000000000 --- a/1029-Fix-potential-overflow-with-some-arrays-at-page-isol.patch +++ /dev/null @@ -1,118 +0,0 @@ -From c497bd6b0d18efa9f7cf4fe49e183fe971868754 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Thu, 7 Dec 2023 10:26:56 +0800 -Subject: [PATCH 29/85] Fix potential overflow with some arrays at - page-isolation logic - -Overflows may happen in the `threshold_string` and `cycle_string` arrays. - -If the PAGE_CE_THRESHOLD value in page isolation is set to 50 bits, -there is a risk of array overflow. Because sprintf is an insecure -function, use snprintf instead. - -An error is reported when the AddressSanitizer is used. 
- -rasdaemon: Improper PAGE_CE_ACTION, set to default soft -rasdaemon: Page offline choice on Corrected Errors is soft -================================================================= -==221920==ERROR: AddressSanitizer: stack-buffer-overflow on address 0xffffdd91d932 at pc 0xffffa24071c4 bp 0xffffdd91d720 sp 0xffffdd91ced8 -WRITE of size 55 at 0xffffdd91d932 thread T0 - #0 0xffffa24071c0 in vsprintf (/usr/lib64/libasan.so.6+0x5c1c0) - #1 0xffffa24073cc in sprintf (/usr/lib64/libasan.so.6+0x5c3cc) - #2 0x459558 in parse_env_string /home/rasdaemon/ras-page-isolation.c:185 - #3 0x4596f4 in page_isolation_init /home/rasdaemon/ras-page-isolation.c:202 - #4 0x459934 in ras_page_account_init /home/rasdaemon/ras-page-isolation.c:211 - #5 0x40f700 in handle_ras_events /home/rasdaemon/ras-events.c:902 - #6 0x405b8c in main /home/rasdaemon/rasdaemon.c:211 - #7 0xffffa20b6f38 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 - #8 0xffffa20b7004 in __libc_start_main_impl ../csu/libc-start.c:409 - #9 0x4038ec in _start (/home/rasdaemon/rasdaemon+0x4038ec) - -Address 0xffffdd91d932 is located in stack of thread T0 at offset 82 in frame - #0 0x459574 in page_isolation_init /home/rasdaemon/ras-page-isolation.c:190 - - This frame has 2 object(s): - [32, 82) 'threshold_string' (line 191) - [128, 178) 'cycle_string' (line 192) <== Memory access at offset 82 partially underflows this variable -HINT: this may be a false positive if your program uses some custom stack unwind mechanism, swapcontext or vfork - (longjmp and C++ exceptions *are* supported) -SUMMARY: AddressSanitizer: stack-buffer-overflow (/usr/lib64/libasan.so.6+0x5c1c0) in vsprintf -Shadow bytes around the buggy address: - 0x200ffbb23ad0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 - 0x200ffbb23ae0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 - 0x200ffbb23af0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 - 0x200ffbb23b00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 - 0x200ffbb23b10: 00 00 
00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 -=>0x200ffbb23b20: 00 00 00 00 00 00[02]f2 f2 f2 f2 f2 00 00 00 00 - 0x200ffbb23b30: 00 00 02 f3 f3 f3 f3 f3 00 00 00 00 00 00 00 00 - 0x200ffbb23b40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 - 0x200ffbb23b50: f1 f1 f1 f1 f1 f1 04 f2 00 00 f2 f2 00 00 00 00 - 0x200ffbb23b60: 00 00 00 f2 f2 f2 f2 f2 00 00 00 00 00 00 00 f2 - 0x200ffbb23b70: f2 f2 f2 f2 00 00 00 00 00 00 00 00 f2 f2 f2 f2 -Shadow byte legend (one shadow byte represents 8 application bytes): - Addressable: 00 - Partially addressable: 01 02 03 04 05 06 07 - Heap left redzone: fa - Freed heap region: fd - Stack left redzone: f1 - Stack mid redzone: f2 - Stack right redzone: f3 - Stack after return: f5 - Stack use after scope: f8 - Global redzone: f9 - Global init order: f6 - Poisoned by user: f7 - Container overflow: fc - Array cookie: ac - Intra object redzone: bb - ASan internal: fe - Left alloca redzone: ca - Right alloca redzone: cb - Shadow gap: cc -==221920==ABORTING - -Signed-off-by: Mauro Carvalho Chehab ---- - ras-page-isolation.c | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index fd7bd70..caa8c31 100644 ---- a/ras-page-isolation.c -+++ b/ras-page-isolation.c -@@ -171,18 +171,18 @@ parse: - config->unit = no_unit ? 
config->unit : ""; - } - --static void parse_env_string(struct isolation *config, char *str) -+static void parse_env_string(struct isolation *config, char *str, unsigned int size) - { - int i; - - if (config->overflow) { - /* when overflow, use basic unit */ - for (i = 0; config->units[i].name; i++) ; -- sprintf(str, "%lu%s", config->val, config->units[i-1].name); -+ snprintf(str, size, "%lu%s", config->val, config->units[i-1].name); - log(TERM, LOG_INFO, "%s is set overflow(%s), truncate it\n", - config->name, config->env); - } else { -- sprintf(str, "%s%s", config->env, config->unit); -+ snprintf(str, size, "%s%s", config->env, config->unit); - } - } - -@@ -199,8 +199,8 @@ static void page_isolation_init(void) - - parse_isolation_env(&threshold); - parse_isolation_env(&cycle); -- parse_env_string(&threshold, threshold_string); -- parse_env_string(&cycle, cycle_string); -+ parse_env_string(&threshold, threshold_string, sizeof(threshold_string)); -+ parse_env_string(&cycle, cycle_string, sizeof(cycle_string)); - log(TERM, LOG_INFO, "Threshold of memory Corrected Errors is %s / %s\n", - threshold_string, cycle_string); - } --- -2.33.1 - diff --git a/1029-anolis-add-nvgpu-reset-trigger.patch b/1029-anolis-add-nvgpu-reset-trigger.patch new file mode 100644 index 0000000000000000000000000000000000000000..3cb9e5b02f914e82a05a317a35c5ee4a82b3f171 --- /dev/null +++ b/1029-anolis-add-nvgpu-reset-trigger.patch @@ -0,0 +1,76 @@ +From 866c8169c9376f7c0b8a23966caaf099ebbeee9e Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 25 Apr 2025 14:11:30 +0800 +Subject: [PATCH 3/3] anolis: add nvgpu reset trigger + +Signed-off-by: Ruidong Tian +--- + contrib/nvgpu_reset_trigger | 40 +++++++++++++++++++++++++++++++++++++ + contrib/rasdaemon.init | 4 ++++ + 2 files changed, 44 insertions(+) + create mode 100755 contrib/nvgpu_reset_trigger + +diff --git a/contrib/nvgpu_reset_trigger b/contrib/nvgpu_reset_trigger +new file mode 100755 +index 0000000..769e5e2 +--- /dev/null ++++ 
b/contrib/nvgpu_reset_trigger +@@ -0,0 +1,40 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when an ++# nvgpu event has occurred; environment variables include the ++# information reported for that event. ++ ++# environment: ++# BDF ++# EVENT_TYPE ++# DATA1 ++# DATA2 ++# ++ ++[ -x ./nvgpu_reset_trigger.local ] && . ./nvgpu_reset_trigger.local ++ ++if [ -d nvgpu_reset_trigger.extern ] ++then ++ ls nvgpu_reset_trigger.extern | ++ while read item ++ do ++ [ -x ./nvgpu_reset_trigger.extern/$item ] && . ./nvgpu_reset_trigger.extern/$item ++ done ++fi ++ ++if [ "$EVENT_TYPE" = "8" ] && [ "$DATA1" = "48" ] ++then ++ sudo nvidia-smi -r -i "$BDF" ++fi ++ ++if [ "$EVENT_TYPE" = "2" ] ++then ++ sudo nvidia-smi -r -i "$BDF" ++fi ++ ++if [ "$EVENT_TYPE" = "37" ] && [ "$DATA1" = "48" ] ++then ++ sudo nvidia-smi -r -i "$BDF" ++fi ++ ++exit 0 +diff --git a/contrib/rasdaemon.init b/contrib/rasdaemon.init +index d575af9..5fde6c8 100644 +--- a/contrib/rasdaemon.init ++++ b/contrib/rasdaemon.init +@@ -13,6 +13,10 @@ case "$target" in + sed -i 's/^PRE_PAGE_OFFLINE_TRIGGER=.*/PRE_PAGE_OFFLINE_TRIGGER="page_offline_pre_trigger"/g' ${ENV_PATH} + sed -i 's/^POST_PAGE_OFFLINE_TRIGGER=.*/POST_PAGE_OFFLINE_TRIGGER="page_offline_post_trigger"/g' ${ENV_PATH} + ;; ++ nvgpu_reset) ++ sed -i 's/^TRIGGER_DIR=.*/TRIGGER_DIR="\/etc\/ras\/triggers"/g' ${ENV_PATH} ++ sed -i 's/^NVGPU_TRIGGER=.*/NVGPU_TRIGGER="nvgpu_reset_trigger"/g' ${ENV_PATH} ++ ;; + jituan) + sed -i 's/json_report,kmsg_monitor,//' ${ENV_PATH} + sed -i 's/^AMDGPU_MCA_ENABLED=.*/AMDGPU_MCA_ENABLED=1/g' ${ENV_PATH} +-- +2.43.5 + diff --git a/1029-anolis-add-trigger-for-nvgpu-event.patch b/1029-anolis-add-trigger-for-nvgpu-event.patch new file mode 100644 index 0000000000000000000000000000000000000000..7e1c34b13c48eddea6ac10b0e11263c1c6f36ed7 --- /dev/null +++ b/1029-anolis-add-trigger-for-nvgpu-event.patch @@ -0,0 +1,201 @@ +From
03cd59d6aafbd14ed29ce2f9a73d0bbd8f8b23d3 Mon Sep 17 00:00:00 2001 +From: Ruidong Tian +Date: Fri, 25 Apr 2025 10:20:16 +0800 +Subject: [PATCH 29/30] anolis: add trigger for nvgpu event + +Signed-off-by: Ruidong Tian +--- + contrib/nvgpu_trigger | 25 +++++++++++++++++++++++++ + misc/rasdaemon.env | 3 +++ + ras-nvgpu-nvml.c | 8 +++++++- + ras-nvgpu.c | 3 +++ + trigger.c | 35 +++++++++++++++++++++++++++++++++++ + trigger.h | 1 + + 9 files changed, 80 insertions(+), 2 deletions(-) + create mode 100755 contrib/nvgpu_trigger + +diff --git a/contrib/nvgpu_trigger b/contrib/nvgpu_trigger +new file mode 100755 +index 0000000..48955af +--- /dev/null ++++ b/contrib/nvgpu_trigger +@@ -0,0 +1,25 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++# This shell script can be executed by rasdaemon in daemon mode when an ++# nvgpu event has occurred; environment variables include the ++# information reported for that event. ++ ++# environment: ++# BDF ++# EVENT_TYPE ++# DATA1 ++# DATA2 ++# ++ ++[ -x ./nvgpu_trigger.local ] && . ./nvgpu_trigger.local ++ ++if [ -d nvgpu_trigger.extern ] ++then ++ ls nvgpu_trigger.extern | ++ while read item ++ do ++ [ -x ./nvgpu_trigger.extern/$item ] && . ./nvgpu_trigger.extern/$item ++ done ++fi ++ ++exit 0 +diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env +index 198b050..b08afa6 100644 +--- a/misc/rasdaemon.env ++++ b/misc/rasdaemon.env +@@ -119,6 +119,9 @@ POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0 + KMSG_TRIGGER= + KMSG_TRIGGER_TIMEOUT=0 + ++NVGPU_TRIGGER= ++NVGPU_TRIGGER_TIMEOUT=0 ++ + # CE Statistic Threshold + # + # Specify the threshold of CE per second.
+diff --git a/ras-nvgpu-nvml.c b/ras-nvgpu-nvml.c +index 541ff69..f2421a1 100644 +--- a/ras-nvgpu-nvml.c ++++ b/ras-nvgpu-nvml.c +@@ -4,6 +4,7 @@ + * Copyright (C) 2025 Alibaba Inc + */ + ++#include + #include + #include + #include +@@ -13,6 +14,7 @@ + #include "ras-nvgpu.h" + #include "trace-seq.h" + #include "types.h" ++#include "trigger.h" + + const char *lib_name[] = { + "/lib64/libnvidia-ml.so", +@@ -42,6 +44,7 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + time_t now; + struct tm *tm; + char timestamp[64]; ++ char tmpbuf[64]; + + time(&now); + tm = localtime(&now); +@@ -66,7 +69,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + trace_seq_printf(&s, "data: %lld ", data->eventData); + } + +- trace_seq_printf(&s, "pci_port: " NVML_DEVICE_PCI_BUS_ID_FMT " ", NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ snprintf(tmpbuf, sizeof(tmpbuf), NVML_DEVICE_PCI_BUS_ID_FMT, NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(&pci)); ++ trace_seq_printf(&s, "pci_port: %s ", tmpbuf); + trace_seq_printf(&s, "gpu-i: %x ", data->gpuInstanceId); + trace_seq_printf(&s, "gpu-ci: %x ", data->computeInstanceId); + +@@ -76,6 +80,8 @@ static int report_ras_gpu_nvml(nvmlEventData_t *data, nvmlDevice_t *devices) + fflush(stdout); + trace_seq_destroy(&s); + ++ run_nvgpu_trigger(tmpbuf, data->eventType, data->eventData, 0); ++ + return 0; + } + +diff --git a/ras-nvgpu.c b/ras-nvgpu.c +index 4d39de2..37a8833 100644 +--- a/ras-nvgpu.c ++++ b/ras-nvgpu.c +@@ -15,6 +15,7 @@ + #include "ras-events.h" + #include "ras-logger.h" + #include "ras-nvgpu.h" ++#include "trigger.h" + void *ras_nvgpu_handle(void *arg) + { + (void)arg; +@@ -41,6 +42,8 @@ void *ras_nvgpu_handle(void *arg) + return NULL; + } + ++ setup_event_trigger("nvgpu_event"); ++ + while (retry--) { + if (ras_nvgpu_nvml_handle()) { + log(ALL, LOG_ERR, "NVGPU nvml handle retry %d\n", retry); +diff --git a/trigger.c b/trigger.c +index d410137..e113077 100644 +--- a/trigger.c ++++ b/trigger.c 
+@@ -101,6 +101,8 @@ struct event_trigger post_page_offline_trigger = {"page_offline", "POST_PAGE_OFF + + struct event_trigger kmsg_trigger = {"kmsg_monitor", "KMSG_TRIGGER"}; + ++struct event_trigger nvgpu_trigger = {"nvgpu_event", "NVGPU_TRIGGER"}; ++ + static struct event_trigger *event_triggers[] = { + &mc_ue_trigger, + #ifdef HAVE_MCE +@@ -122,6 +124,9 @@ static struct event_trigger *event_triggers[] = { + #ifdef HAVE_KMSG_MONITOR + &kmsg_trigger, + #endif ++#ifdef HAVE_NVGPU ++ &nvgpu_trigger, ++#endif + }; + + void setup_event_trigger(const char *event) +@@ -476,3 +481,33 @@ free: + free(env[i]); + } + ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2) ++{ ++ char *env[MAX_ENV]; ++ int ei = 0; ++ struct event_trigger *trigger = &nvgpu_trigger; ++ ++ if (!trigger->path || !strcmp(trigger->path, "")) ++ return; ++ ++ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) ++ goto free; ++ if (asprintf(&env[ei++], "BDF=%s", pci_bdf) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "EVENT_TYPE=%d", event_type) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA1=%d", data1) < 0) ++ goto free; ++ if (asprintf(&env[ei++], "DATA2=%d", data2) < 0) ++ goto free; ++ ++ env[ei] = NULL; ++ assert(ei < MAX_ENV); ++ ++ run_trigger(trigger, NULL, env); ++ ++free: ++ for (int i = 0; i < ei; i++) ++ free(env[i]); ++} ++ +diff --git a/trigger.h b/trigger.h +index b5a6c2c..2ea2b09 100644 +--- a/trigger.h ++++ b/trigger.h +@@ -29,6 +29,7 @@ void run_mf_event_trigger(struct ras_mf_event *e); + void run_aer_event_trigger(struct ras_aer_event *e); + void run_page_offline_trigger(unsigned long long addr, int otype, int type); + void run_kmsg_trigger(struct kmsg_tracer_info *kmsg_tracer, const char *msg); ++void run_nvgpu_trigger(char *pci_bdf, int event_type, int data1, int data2); + + + #endif +-- +2.43.5 + diff --git a/1030-anolis-print-logs-in-the-same-line.patch b/1030-anolis-print-logs-in-the-same-line.patch deleted
file mode 100644 index 9f4f295b69d00e193d5834ef4a40ce5beeacc6ec..0000000000000000000000000000000000000000 --- a/1030-anolis-print-logs-in-the-same-line.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 45306973e2c357e9c39f4a423371d2c1ce9ebc12 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Tue, 12 Mar 2024 14:28:55 +0800 -Subject: [PATCH 30/85] anolis: print logs in the same line - -Signed-off-by: Mauro Carvalho Chehab -[Ruidong: fix conflict with clean code] ---- - ras-arm-handler.c | 16 ++++++++-------- - ras-non-standard-handler.c | 4 ++-- - 2 files changed, 10 insertions(+), 10 deletions(-) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index a0dfc51..abd8c9b 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -180,7 +180,7 @@ int ras_arm_event_handler(struct trace_seq *s, - if (tm) - strftime(ev.timestamp, sizeof(ev.timestamp), - "%Y-%m-%d %H:%M:%S %z", tm); -- trace_seq_printf(s, "%s\n", ev.timestamp); -+ trace_seq_printf(s, "%s", ev.timestamp); - - if (pevent_get_field_val(s, event, "affinity", record, &val, 1) < 0) - return -1; -@@ -190,27 +190,27 @@ int ras_arm_event_handler(struct trace_seq *s, - if (pevent_get_field_val(s, event, "mpidr", record, &val, 1) < 0) - return -1; - ev.mpidr = val; -- trace_seq_printf(s, "\n MPIDR: 0x%llx", (unsigned long long)ev.mpidr); -+ trace_seq_printf(s, " MPIDR: 0x%llx", (unsigned long long)ev.mpidr); - - if (pevent_get_field_val(s, event, "midr", record, &val, 1) < 0) - return -1; - ev.midr = val; -- trace_seq_printf(s, "\n MIDR: 0x%llx", (unsigned long long)ev.midr); -+ trace_seq_printf(s, " MIDR: 0x%llx", (unsigned long long)ev.midr); - - if (pevent_get_field_val(s, event, "running_state", record, &val, 1) < 0) - return -1; - ev.running_state = val; -- trace_seq_printf(s, "\n running_state: %d", ev.running_state); -+ trace_seq_printf(s, " running_state: %d", ev.running_state); - - if (pevent_get_field_val(s, event, "psci_state", record, &val, 1) < 0) - return -1; - ev.psci_state = val; -- trace_seq_printf(s, 
"\n psci_state: %d", ev.psci_state); -+ trace_seq_printf(s, " psci_state: %d", ev.psci_state); - - if (pevent_get_field_val(s, event, "pei_len", record, &val, 1) < 0) - return -1; - ev.pei_len = val; -- trace_seq_printf(s, "\n ARM Processor Err Info data len: %d\n", -+ trace_seq_printf(s, " ARM Processor Err Info data len: %d\n", - ev.pei_len); - - ev.pei_error = pevent_get_field_raw(s, event, "buf", record, &len, 1); -@@ -221,7 +221,7 @@ int ras_arm_event_handler(struct trace_seq *s, - if (pevent_get_field_val(s, event, "ctx_len", record, &val, 1) < 0) - return -1; - ev.ctx_len = val; -- trace_seq_printf(s, "\n ARM Processor Err Context Info data len: %d\n", -+ trace_seq_printf(s, " ARM Processor Err Context Info data len: %d\n", - ev.ctx_len); - - ev.ctx_error = pevent_get_field_raw(s, event, "buf1", record, &len, 1); -@@ -232,7 +232,7 @@ int ras_arm_event_handler(struct trace_seq *s, - if (pevent_get_field_val(s, event, "oem_len", record, &val, 1) < 0) - return -1; - ev.oem_len = val; -- trace_seq_printf(s, "\n Vendor Specific Err Info data len: %d\n", -+ trace_seq_printf(s, " Vendor Specific Err Info data len: %d\n", - ev.oem_len); - - ev.vsei_error = pevent_get_field_raw(s, event, "buf2", record, &len, 1); -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 8672b16..8efb660 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -168,8 +168,8 @@ int ras_non_standard_event_handler(struct trace_seq *s, - return -1; - if (strcmp(uuid_le(ev.sec_type), - "e8ed898d-df16-43cc-8ecc-54f060ef157f") == 0) -- trace_seq_printf(s, "\n section type: %s", -- "Ampere Specific Error\n"); -+ trace_seq_printf(s, " section type: %s", -+ "Ampere Specific Error"); - else - trace_seq_printf(s, " section type: %s", - uuid_le(ev.sec_type)); --- -2.33.1 - diff --git a/1031-rasdaemon-ras-memory-failure-handler-update-memory-f.patch b/1031-rasdaemon-ras-memory-failure-handler-update-memory-f.patch deleted file mode 100644 index 
16d936953febcd3eebd76d1fc015e62a4c845018..0000000000000000000000000000000000000000 --- a/1031-rasdaemon-ras-memory-failure-handler-update-memory-f.patch +++ /dev/null @@ -1,60 +0,0 @@ -From bf1839dcc477ee8065c2edfbe07b685cbaf4274e Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Tue, 6 Feb 2024 12:08:00 +0000 -Subject: [PATCH 31/85] rasdaemon: ras-memory-failure-handler: update memory - failure action page types - -Update memory failure action page types corresponding to the same in -mm/memory-failure.c in the kernel. - -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - ras-memory-failure-handler.c | 6 ------ - 1 file changed, 6 deletions(-) - -diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index 1951456..adbd736 100644 ---- a/ras-memory-failure-handler.c -+++ b/ras-memory-failure-handler.c -@@ -27,10 +27,8 @@ enum mf_action_page_type { - MF_MSG_KERNEL_HIGH_ORDER, - MF_MSG_SLAB, - MF_MSG_DIFFERENT_COMPOUND, -- MF_MSG_POISONED_HUGE, - MF_MSG_HUGE, - MF_MSG_FREE_HUGE, -- MF_MSG_NON_PMD_HUGE, - MF_MSG_UNMAP_FAILED, - MF_MSG_DIRTY_SWAPCACHE, - MF_MSG_CLEAN_SWAPCACHE, -@@ -42,7 +40,6 @@ enum mf_action_page_type { - MF_MSG_CLEAN_LRU, - MF_MSG_TRUNCATED_LRU, - MF_MSG_BUDDY, -- MF_MSG_BUDDY_2ND, - MF_MSG_DAX, - MF_MSG_UNSPLIT_THP, - MF_MSG_UNKNOWN, -@@ -65,10 +62,8 @@ static const struct { - { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"}, - { MF_MSG_SLAB, "kernel slab page"}, - { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"}, -- { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"}, - { MF_MSG_HUGE, "huge page"}, - { MF_MSG_FREE_HUGE, "free huge page"}, -- { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"}, - { MF_MSG_UNMAP_FAILED, "unmapping failed page"}, - { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"}, - { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"}, -@@ -80,7 +75,6 @@ static const struct { - { MF_MSG_CLEAN_LRU, "clean LRU page"}, - { MF_MSG_TRUNCATED_LRU, "already truncated 
LRU page"}, - { MF_MSG_BUDDY, "free buddy page"}, -- { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"}, - { MF_MSG_DAX, "dax page"}, - { MF_MSG_UNSPLIT_THP, "unsplit thp"}, - { MF_MSG_UNKNOWN, "unknown page"}, --- -2.33.1 - diff --git a/1032-rasdaemon-ras-mc-ctl-Add-support-to-display-mcastatu.patch b/1032-rasdaemon-ras-mc-ctl-Add-support-to-display-mcastatu.patch deleted file mode 100644 index 5b8fe9091ad4b28d5212526b0bdf89a95002063a..0000000000000000000000000000000000000000 --- a/1032-rasdaemon-ras-mc-ctl-Add-support-to-display-mcastatu.patch +++ /dev/null @@ -1,60 +0,0 @@ -From e2efcbe5b1627c577f8d133b958114c34539e459 Mon Sep 17 00:00:00 2001 -From: Avadhut Naik -Date: Mon, 25 Mar 2024 23:06:08 -0500 -Subject: [PATCH 32/85] rasdaemon: ras-mc-ctl: Add support to display - mcastatus_msg string - -Currently, the mcastatus_msg string of struct mce_event is added to the -SQLite database by the rasdaemon when it is recording errors. The same -however, is not outputted by the ras-mc-ctl utility. - -The string provides important error information relating to the received -MCE. For example, on AMD SMCA systems, the string outputs extended error -code and description. As such, the string should be present in the -output of ras-mc-ctl utility. - -Add support to output the string through the ras-mc-ctl utility. 
- -Signed-off-by: Avadhut Naik -Signed-off-by: Mauro Carvalho Chehab ---- - util/ras-mc-ctl.in | 7 ++++--- - 1 file changed, 4 insertions(+), 3 deletions(-) - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index fb35afe..725d215 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -1329,7 +1329,7 @@ sub errors - { - require DBI; - my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); -- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); -+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location); - my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); - my ($bus_name, $dev_name, $driver_name, $reporter_name); - my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd); -@@ -1497,10 +1497,10 @@ sub errors - - # MCE mce_record errors - if ($has_mce == 1) { -- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record$conf{opt}{since} order by id"; -+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record$conf{opt}{since} order by id"; - $query_handle = $dbh->prepare($query); - $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, 
$bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location)); - $out = ""; - while($query_handle->fetch()) { - $out .= "$id $time error: $msg"; -@@ -1508,6 +1508,7 @@ sub errors - $out .= ", bank $bank_name" if ($bank_name); - $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); - $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -+ $out .= ", mca $mcastatus_msg" if ($mcastatus_msg); - $out .= ", $mc_location" if ($mc_location); - $out .= ", $user_action" if ($user_action); - $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); --- -2.33.1 - diff --git a/1033-rasdaemon-fix-table-create-if-some-cpus-are-offline.patch b/1033-rasdaemon-fix-table-create-if-some-cpus-are-offline.patch deleted file mode 100644 index 0a0bfd5c3bc962cf47fd76c88bcb2fa71a257295..0000000000000000000000000000000000000000 --- a/1033-rasdaemon-fix-table-create-if-some-cpus-are-offline.patch +++ /dev/null @@ -1,177 +0,0 @@ -From bbcf65669b1efda78bb4a4762bfb4e3886d4f371 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Sun, 5 Mar 2023 23:14:42 +0000 -Subject: [PATCH 33/85] rasdaemon: fix table create if some cpus are offline - -Fix for regression in ras_mc_create_table() if some cpus are offline -at the system start - -Issue: - -Regression in the ras_mc_create_table() if some of the cpus are offline -at the system start when run the rasdaemon. - -This issue is reproducible in ras_mc_create_table() with decode and -record non-standard events and reproducible sometimes with -ras_mc_create_table() for the standard events. 
- -Also in the multi thread way, there is memory leak in ras_mc_event_opendb() -as struct sqlite3_priv *priv and sqlite3 *db allocated/initialized per -thread, but stored in the common struct ras_events ras in pthread data, -which is shared across the threads. - -Reason: - -when the system starts with some of the cpus offline and then run -the rasdaemon, read_ras_event_all_cpus() exit with error and switch to -the multi thread way. However read() in read_ras_event() return error in -threads for each of the offline CPUs and does clean up including calling -ras_mc_event_closedb(). - -Since the 'struct ras_events ras' passed in the pthread_data to each of the -threads is common, struct sqlite3_priv *priv and sqlite3 *db allocated/ -initialized per thread and stored in the common 'struct ras_events ras', -are getting overwritten in each ras_mc_event_opendb()(which called from -pthread per cpu), result memory leak. - -Also when ras_mc_event_closedb() is called in the above error case from -the threads corresponding to the offline cpus, close the sqlite3 *db and -free sqlite3_priv *priv stored in the common 'struct ras_events ras', -result regression when accessing priv->db in the ras_mc_create_table() -from another context later. - -Solution: - -In ras_mc_event_opendb(), allocate struct sqlite3_priv *priv, -init sqlite3 *db and create tables common for the threads with shared -'struct ras_events ras' based on a reference count and free them in the -same way. - -Also protect critical code ras_mc_event_opendb() and ras_mc_event_closedb() -using mutex in the multi thread case from any regression caused by the -thread pre-emption. 
- -Reported-by: Lei Feng -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - ras-events.c | 16 +++++++++++++++- - ras-events.h | 4 +++- - ras-record.c | 12 ++++++++++++ - 3 files changed, 30 insertions(+), 2 deletions(-) - -diff --git a/ras-events.c b/ras-events.c -index 31a4e0b..a5ff661 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -630,19 +630,25 @@ static void *handle_ras_events_cpu(void *priv) - - log(TERM, LOG_INFO, "Listening to events on cpu %d\n", pdata->cpu); - if (pdata->ras->record_events) { -+ pthread_mutex_lock(&pdata->ras->db_lock); - if (ras_mc_event_opendb(pdata->cpu, pdata->ras)) { -+ pthread_mutex_unlock(&pdata->ras->db_lock); - log(TERM, LOG_ERR, "Can't open database\n"); - close(fd); - kbuffer_free(kbuf); - free(page); - return 0; - } -+ pthread_mutex_unlock(&pdata->ras->db_lock); - } - - read_ras_event(fd, pdata, kbuf, page); - -- if (pdata->ras->record_events) -+ if (pdata->ras->record_events) { -+ pthread_mutex_lock(&pdata->ras->db_lock); - ras_mc_event_closedb(pdata->cpu, pdata->ras); -+ pthread_mutex_unlock(&pdata->ras->db_lock); -+ } - - close(fd); - kbuffer_free(kbuf); -@@ -992,6 +998,11 @@ int handle_ras_events(int record_events) - - /* Poll doesn't work on this kernel. Fallback to pthread way */ - if (rc == -255) { -+ if (pthread_mutex_init(&ras->db_lock, NULL) != 0) { -+ log(SYSLOG, LOG_INFO, "sqlite db lock init has failed\n"); -+ goto err; -+ } -+ - log(SYSLOG, LOG_INFO, - "Opening one thread per cpu (%d threads)\n", cpus); - for (i = 0; i < cpus; i++) { -@@ -1004,6 +1015,8 @@ int handle_ras_events(int record_events) - i); - while (--i) - pthread_cancel(data[i].thread); -+ -+ pthread_mutex_destroy(&ras->db_lock); - goto err; - } - } -@@ -1011,6 +1024,7 @@ int handle_ras_events(int record_events) - /* Wait for all threads to complete */ - for (i = 0; i < cpus; i++) - pthread_join(data[i].thread, NULL); -+ pthread_mutex_destroy(&ras->db_lock); - } - - log(SYSLOG, LOG_INFO, "Huh! something got wrong. 
Aborting.\n"); -diff --git a/ras-events.h b/ras-events.h -index 4e36726..73f6bbb 100644 ---- a/ras-events.h -+++ b/ras-events.h -@@ -56,7 +56,9 @@ struct ras_events { - time_t uptime_diff; - - /* For ras-record */ -- void *db_priv; -+ void *db_priv; -+ int db_ref_count; -+ pthread_mutex_t db_lock; - - /* For the mce handler */ - struct mce_priv *mce_priv; -diff --git a/ras-record.c b/ras-record.c -index 8f61d40..adb00ca 100644 ---- a/ras-record.c -+++ b/ras-record.c -@@ -763,6 +763,10 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) - - printf("Calling %s()\n", __FUNCTION__); - -+ ras->db_ref_count++; -+ if (ras->db_ref_count > 1) -+ return 0; -+ - ras->db_priv = NULL; - - priv = calloc(1, sizeof(*priv)); -@@ -912,6 +916,13 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) - - printf("Calling %s()\n", __func__); - -+ if (ras->db_ref_count > 0) -+ ras->db_ref_count--; -+ else -+ return -1; -+ if (ras->db_ref_count > 0) -+ return 0; -+ - if (!priv) - return -1; - -@@ -1018,6 +1029,7 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) - log(TERM, LOG_ERR, - "cpu %u: Failed to shutdown sqlite: error = %d\n", cpu, rc); - free(priv); -+ ras->db_priv = NULL; - - return 0; - } --- -2.33.1 - diff --git a/1034-rasdaemon-fix-return-value-type-issue-of-read-write-.patch b/1034-rasdaemon-fix-return-value-type-issue-of-read-write-.patch deleted file mode 100644 index e4bc686d9e26434f8b2fcf7337779466107ebac3..0000000000000000000000000000000000000000 --- a/1034-rasdaemon-fix-return-value-type-issue-of-read-write-.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 2ef01c6e146f9a806fad6d4bdc17578c85e76d34 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Thu, 11 May 2023 10:54:26 +0800 -Subject: [PATCH 34/85] rasdaemon: fix return value type issue of read/write - function from unistd.h - -The return value type of read/write function from unistd.h is ssize_t. -It's signed normally, and return -1 on error. 
Fix incorrect use in the -function read_ras_event_all_cpus(). - -BTW, make setting buffer_percent as a separate function. - -Fixes: 94750bcf9309 ("rasdaemon: Fix poll() on per_cpu trace_pipe_raw blocks indefinitely") -Signed-off-by: Xiaofei Tan -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - ras-events.c | 45 ++++++++++++++++++++++++++++++--------------- - 1 file changed, 30 insertions(+), 15 deletions(-) - -diff --git a/ras-events.c b/ras-events.c -index a5ff661..fc8faec 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -366,10 +366,37 @@ static int get_num_cpus(struct ras_events *ras) - #endif - } - -+static int set_buffer_percent(struct ras_events *ras, int percent) -+{ -+ char buf[16]; -+ ssize_t size; -+ int res = 0; -+ int fd; -+ -+ fd = open_trace(ras, "buffer_percent", O_WRONLY); -+ if (fd >= 0) { -+ /* For the backward compatibility to the old kernels, do not return -+ * if fail to set the buffer_percent. -+ */ -+ snprintf(buf, sizeof(buf), "%d", percent); -+ size = write(fd, buf, strlen(buf)); -+ if (size <= 0) { -+ log(TERM, LOG_WARNING, "can't write to buffer_percent\n"); -+ res = -1; -+ } -+ close(fd); -+ } else { -+ log(TERM, LOG_WARNING, "Can't open buffer_percent\n"); -+ res = -1; -+ } -+ -+ return res; -+} -+ - static int read_ras_event_all_cpus(struct pthread_data *pdata, - unsigned n_cpus) - { -- unsigned size; -+ ssize_t size; - unsigned long long time_stamp; - void *data; - int ready, i, count_nready; -@@ -381,8 +408,6 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - int warnonce[n_cpus]; - char pipe_raw[PATH_MAX]; - int legacy_kernel = 0; -- int fd; -- char buf[16]; - #if 0 - int need_sleep = 0; - #endif -@@ -409,18 +434,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - * Set buffer_percent to 0 so that poll() will return immediately - * when the trace data is available in the ras per_cpu trace pipe_raw - */ -- fd = open_trace(pdata[0].ras, "buffer_percent", O_WRONLY); -- if (fd 
>= 0) { -- /* For the backward compatibility to the old kernels, do not return -- * if fail to set the buffer_percent. -- */ -- snprintf(buf, sizeof(buf), "0"); -- size = write(fd, buf, strlen(buf)); -- if (size <= 0) -- log(TERM, LOG_WARNING, "can't write to buffer_percent\n"); -- close(fd); -- } else -- log(TERM, LOG_WARNING, "Can't open buffer_percent\n"); -+ if (set_buffer_percent(pdata[0].ras, 0)) -+ log(TERM, LOG_WARNING, "Set buffer_percent failed\n"); - - for (i = 0; i < (n_cpus + 1); i++) - fds[i].fd = -1; --- -2.33.1 - diff --git a/1035-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch b/1035-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch deleted file mode 100644 index 594b0c03d75c0ff41f766702a6a37b58ba021c59..0000000000000000000000000000000000000000 --- a/1035-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch +++ /dev/null @@ -1,110 +0,0 @@ -From 5727ef175dcafa012e04c8bc991d876ea29bbc66 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Tue, 30 May 2023 11:44:12 +0100 -Subject: [PATCH 35/85] rasdaemon: fix issue of signed and unsigned integer - comparison and remove redundant header file - -1. The return value of ARRAY_SIZE() is unsigned integer. It isn't right to -compare it with a signed integer. This patch fix them. - -2. Remove redundant header file and adjust the header files sequence. 
- -Signed-off-by: Xiaofei Tan -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisi_hip08.c | 2 +- - non-standard-hisilicon.c | 8 ++++---- - ras-diskerror-handler.c | 2 +- - ras-memory-failure-handler.c | 5 ++--- - 4 files changed, 8 insertions(+), 9 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 4ef47ea..61f12eb 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -1029,7 +1029,7 @@ static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { - - static void __attribute__((constructor)) hip08_init(void) - { -- int i; -+ unsigned int i; - - for (i = 0; i < ARRAY_SIZE(hip08_ns_ev_decoder); i++) - register_ns_ev_decoder(&hip08_ns_ev_decoder[i]); -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 2b00ed6..721821e 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -366,13 +366,13 @@ static int decode_hisi_common_section(struct ras_events *ras, - trace_seq_printf(s, "%s\n", hevent.error_msg); - - if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) { -- int i; -+ unsigned int i; - - trace_seq_printf(s, "Register Dump:\n"); - for (i = 0; i < err->reg_array_size / sizeof(uint32_t); i++) { -- trace_seq_printf(s, "reg%02d=0x%08x\n", i, -+ trace_seq_printf(s, "reg%02u=0x%08x\n", i, - err->reg_array[i]); -- HISI_SNPRINTF(hevent.reg_msg, "reg%02d=0x%08x", -+ HISI_SNPRINTF(hevent.reg_msg, "reg%02u=0x%08x", - i, err->reg_array[i]); - } - } -@@ -398,7 +398,7 @@ static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { - - static void __attribute__((constructor)) hisi_ns_init(void) - { -- int i; -+ unsigned int i; - - for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++) - register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]); -diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c -index b16319f..b46f859 100644 ---- a/ras-diskerror-handler.c -+++ b/ras-diskerror-handler.c -@@ -52,7 
+52,7 @@ static const struct { - - static const char *get_blk_error(int err) - { -- int i; -+ unsigned int i; - - for (i = 0; i < ARRAY_SIZE(blk_errors); i++) - if (blk_errors[i].error == err) -diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index adbd736..4798ead 100644 ---- a/ras-memory-failure-handler.c -+++ b/ras-memory-failure-handler.c -@@ -16,7 +16,6 @@ - #include - #include - #include "libtrace/kbuffer.h" --#include "ras-memory-failure-handler.h" - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-report.h" -@@ -93,7 +92,7 @@ static const struct { - - static const char *get_page_type(int page_type) - { -- int i; -+ unsigned int i; - - for (i = 0; i < ARRAY_SIZE(mf_page_type); i++) - if (mf_page_type[i].type == page_type) -@@ -104,7 +103,7 @@ static const char *get_page_type(int page_type) - - static const char *get_action_result(int result) - { -- int i; -+ unsigned int i; - - for (i = 0; i < ARRAY_SIZE(mf_action_result); i++) - if (mf_action_result[i].result == result) --- -2.33.1 - diff --git a/1036-rasdaemon-Add-support-for-creating-the-vendor-error-.patch b/1036-rasdaemon-Add-support-for-creating-the-vendor-error-.patch deleted file mode 100644 index 8b995cf06163e03d75c2c6294facd9b841907d29..0000000000000000000000000000000000000000 --- a/1036-rasdaemon-Add-support-for-creating-the-vendor-error-.patch +++ /dev/null @@ -1,343 +0,0 @@ -From eae25e65c6dbbd21797c42c490c427754943d688 Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Wed, 31 May 2023 16:24:36 +0100 -Subject: [PATCH 36/85] rasdaemon: Add support for creating the vendor error - tables at startup - -1. Support for create/open the vendor error tables at rasdaemon startup. -2. Make changes in the HiSilicon error handling code for the same. 
- -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisi_hip08.c | 76 ++++++++++++++++++++++---------------- - non-standard-hisilicon.c | 28 +++++++++----- - ras-events.c | 17 ++++++++- - ras-non-standard-handler.c | 35 +++++++++++++++++- - ras-non-standard-handler.h | 3 ++ - 5 files changed, 116 insertions(+), 43 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 61f12eb..0899812 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -654,6 +654,20 @@ static void decode_oem_type1_err_regs(struct ras_ns_ev_decoder *ev_decoder, - step_vendor_data_tab(ev_decoder, "hip08_oem_type1_event_tab"); - } - -+static int add_hip08_oem_type1_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder) -+{ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -+ &hip08_oem_type1_event_tab) != SQLITE_OK) { -+ log(TERM, LOG_WARNING, "Failed to create sql hip08_oem_type1_event_tab\n"); -+ return -1; -+ } -+ } -+#endif -+ return 0; -+} -+ - /* error data decoding functions */ - static int decode_hip08_oem_type1_error(struct ras_events *ras, - struct ras_ns_ev_decoder *ev_decoder, -@@ -669,17 +683,6 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras, - return -1; - } - --#ifdef HAVE_SQLITE3 -- if (ras->record_events && !ev_decoder->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -- &hip08_oem_type1_event_tab) -- != SQLITE_OK) { -- trace_seq_printf(s, -- "create sql hip08_oem_type1_event_tab fail\n"); -- return -1; -- } -- } --#endif - record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE1_FIELD_TIMESTAMP, - 0, event->timestamp); -@@ -827,6 +830,20 @@ static void decode_oem_type2_err_regs(struct ras_ns_ev_decoder *ev_decoder, - step_vendor_data_tab(ev_decoder, "hip08_oem_type2_event_tab"); - } - -+static int 
add_hip08_oem_type2_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder) -+{ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -+ &hip08_oem_type2_event_tab) != SQLITE_OK) { -+ log(TERM, LOG_WARNING, "Failed to create sql hip08_oem_type2_event_tab\n"); -+ return -1; -+ } -+ } -+#endif -+ return 0; -+} -+ - static int decode_hip08_oem_type2_error(struct ras_events *ras, - struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, -@@ -841,16 +858,6 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras, - return -1; - } - --#ifdef HAVE_SQLITE3 -- if (ras->record_events && !ev_decoder->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -- &hip08_oem_type2_event_tab) != SQLITE_OK) { -- trace_seq_printf(s, -- "create sql hip08_oem_type2_event_tab fail\n"); -- return -1; -- } -- } --#endif - record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_OEM_TYPE2_FIELD_TIMESTAMP, - 0, event->timestamp); -@@ -977,6 +984,20 @@ static void decode_pcie_local_err_regs(struct ras_ns_ev_decoder *ev_decoder, - step_vendor_data_tab(ev_decoder, "hip08_pcie_local_event_tab"); - } - -+static int add_hip08_pcie_local_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder) -+{ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -+ &hip08_pcie_local_event_tab) != SQLITE_OK) { -+ log(TERM, LOG_WARNING, "Failed to create sql hip08_pcie_local_event_tab\n"); -+ return -1; -+ } -+ } -+#endif -+ return 0; -+} -+ - static int decode_hip08_pcie_local_error(struct ras_events *ras, - struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, -@@ -991,16 +1012,6 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, - return -1; - } - --#ifdef HAVE_SQLITE3 -- if (ras->record_events && !ev_decoder->stmt_dec_record) 
{ -- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -- &hip08_pcie_local_event_tab) != SQLITE_OK) { -- trace_seq_printf(s, -- "create sql hip08_pcie_local_event_tab fail\n"); -- return -1; -- } -- } --#endif - record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HIP08_PCIE_LOCAL_FIELD_TIMESTAMP, - 0, event->timestamp); -@@ -1015,14 +1026,17 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras, - static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = { - { - .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5", -+ .add_table = add_hip08_oem_type1_table, - .decode = decode_hip08_oem_type1_error, - }, - { - .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d", -+ .add_table = add_hip08_oem_type2_table, - .decode = decode_hip08_oem_type2_error, - }, - { - .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772", -+ .add_table = add_hip08_pcie_local_table, - .decode = decode_hip08_pcie_local_error, - }, - }; -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 721821e..7296d28 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -341,6 +341,23 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, - HISI_SNPRINTF(event->error_msg, "]"); - } - -+static int add_hisi_common_table(struct ras_events *ras, -+ struct ras_ns_ev_decoder *ev_decoder) -+{ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && -+ !ev_decoder->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -+ &hisi_common_section_tab) != SQLITE_OK) { -+ log(TERM, LOG_WARNING, "Failed to create sql hisi_common_section_tab\n"); -+ return -1; -+ } -+ } -+#endif -+ -+ return 0; -+} -+ - static int decode_hisi_common_section(struct ras_events *ras, - struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, -@@ -350,16 +367,6 @@ static int decode_hisi_common_section(struct ras_events *ras, - (struct hisi_common_error_section *)event->error; - struct hisi_event hevent; - --#ifdef 
HAVE_SQLITE3 -- if (ras->record_events && !ev_decoder->stmt_dec_record) { -- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -- &hisi_common_section_tab) != SQLITE_OK) { -- trace_seq_printf(s, "create sql hisi_common_section_tab fail\n"); -- return -1; -- } -- } --#endif -- - memset(&hevent, 0, sizeof(struct hisi_event)); - trace_seq_printf(s, "\nHisilicon Common Error Section:\n"); - decode_hisi_common_section_hdr(ev_decoder, err, &hevent); -@@ -392,6 +399,7 @@ static int decode_hisi_common_section(struct ras_events *ras, - static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = { - { - .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586", -+ .add_table = add_hisi_common_table, - .decode = decode_hisi_common_section, - }, - }; -diff --git a/ras-events.c b/ras-events.c -index fc8faec..8d70e43 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -472,6 +472,10 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - if (pdata[0].ras->record_events) { - if (ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras)) - goto error; -+#ifdef HAVE_NON_STANDARD -+ if (ras_ns_add_vendor_tables(pdata[0].ras)) -+ log(TERM, LOG_ERR, "Can't add vendor table\n"); -+#endif - } - - do { -@@ -556,8 +560,12 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - "Old kernel detected. 
Stop listening and fall back to pthread way.\n"); - - cleanup: -- if (pdata[0].ras->record_events) -+ if (pdata[0].ras->record_events) { -+#ifdef HAVE_NON_STANDARD -+ ras_ns_finalize_vendor_tables(); -+#endif - ras_mc_event_closedb(pdata[0].cpu, pdata[0].ras); -+ } - - error: - kbuffer_free(kbuf); -@@ -654,6 +662,10 @@ static void *handle_ras_events_cpu(void *priv) - free(page); - return 0; - } -+#ifdef HAVE_NON_STANDARD -+ if (ras_ns_add_vendor_tables(pdata->ras)) -+ log(TERM, LOG_ERR, "Can't add vendor table\n"); -+#endif - pthread_mutex_unlock(&pdata->ras->db_lock); - } - -@@ -661,6 +673,9 @@ static void *handle_ras_events_cpu(void *priv) - - if (pdata->ras->record_events) { - pthread_mutex_lock(&pdata->ras->db_lock); -+#ifdef HAVE_NON_STANDARD -+ ras_ns_finalize_vendor_tables(); -+#endif - ras_mc_event_closedb(pdata->cpu, pdata->ras); - pthread_mutex_unlock(&pdata->ras->db_lock); - } -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 8efb660..762993b 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -75,6 +75,32 @@ int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) - return 0; - } - -+int ras_ns_add_vendor_tables(struct ras_events *ras) -+{ -+ struct ras_ns_ev_decoder *ns_ev_decoder; -+ int error = 0; -+ -+#ifdef HAVE_SQLITE3 -+ if (!ras) -+ return -1; -+ -+ ns_ev_decoder = ras_ns_ev_dec_list; -+ while (ns_ev_decoder) { -+ if (ns_ev_decoder->add_table && !ns_ev_decoder->stmt_dec_record) { -+ error = ns_ev_decoder->add_table(ras, ns_ev_decoder); -+ if (error) -+ break; -+ } -+ ns_ev_decoder = ns_ev_decoder->next; -+ } -+ -+ if (error) -+ return -1; -+#endif -+ -+ return 0; -+} -+ - static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p_ns_ev_dec) - { - struct ras_ns_ev_decoder *ns_ev_decoder; -@@ -96,7 +122,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p - return 0; - } - --static void unregister_ns_ev_decoder(void) -+void 
ras_ns_finalize_vendor_tables(void) - { - #ifdef HAVE_SQLITE3 - struct ras_ns_ev_decoder *ns_ev_decoder = ras_ns_ev_dec_list; -@@ -108,6 +134,13 @@ static void unregister_ns_ev_decoder(void) - } - ns_ev_decoder = ns_ev_decoder->next; - } -+#endif -+} -+ -+static void unregister_ns_ev_decoder(void) -+{ -+#ifdef HAVE_SQLITE3 -+ ras_ns_finalize_vendor_tables(); - #endif - ras_ns_ev_dec_list = NULL; - } -diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h -index 393b756..834f84a 100644 ---- a/ras-non-standard-handler.h -+++ b/ras-non-standard-handler.h -@@ -20,6 +20,7 @@ - struct ras_ns_ev_decoder { - struct ras_ns_ev_decoder *next; - const char *sec_type; -+ int (*add_table)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder); - int (*decode)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder, - struct trace_seq *s, struct ras_non_standard_event *event); - #ifdef HAVE_SQLITE3 -@@ -36,6 +37,8 @@ void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index); - - #ifdef HAVE_NON_STANDARD - int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder); -+int ras_ns_add_vendor_tables(struct ras_events *ras); -+void ras_ns_finalize_vendor_tables(void); - #else - static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; }; - #endif --- -2.33.1 - diff --git a/1037-rasdaemon-Fix-for-vendor-errors-are-not-recorded-in-.patch b/1037-rasdaemon-Fix-for-vendor-errors-are-not-recorded-in-.patch deleted file mode 100644 index 95d7ce5bad0b51039a3f71c4ab0f8997b757fc6b..0000000000000000000000000000000000000000 --- a/1037-rasdaemon-Fix-for-vendor-errors-are-not-recorded-in-.patch +++ /dev/null @@ -1,102 +0,0 @@ -From de5ee630f0009195e115e478e3bb79f6e9fc3a9a Mon Sep 17 00:00:00 2001 -From: Shiju Jose -Date: Wed, 20 Mar 2024 12:16:05 +0000 -Subject: [PATCH 37/85] rasdaemon: Fix for vendor errors are not recorded in - the SQLite database if some cpus are offline - -Fix for vendor errors are not recorded in 
the SQLite database if some cpus -are offline at the system start. - -Issue: - -This issue is reproducible by offline some cpus, run -./rasdaemon -f --record & and -inject vendor specific error supported in the rasdaemon. - -Reason: - -When the system starts with some of the cpus offline and then run -the rasdaemon, read_ras_event_all_cpus() exit with error and switch to -the multi thread way. However read() in read_ras_event() return error in -threads for each of the offline CPUs and does clean up including calling -ras_ns_finalize_vendor_tables(), which invokes sqlite3_finalize() on vendor -tables created. Thus the vendor error data does not stored in the SQLite -database when such error is reported next time. - -Solution: - -In ras_ns_add_vendor_tables() and ras_ns_finalize_vendor_tables() use -reference count and close vendor tables which created in ras_ns_add_vendor_tables() -based on the reference count. - -Reported-by: Junhao He -Signed-off-by: Shiju Jose -Signed-off-by: Mauro Carvalho Chehab ---- - ras-non-standard-handler.c | 16 ++++++++++++++++ - ras-non-standard-handler.h | 1 + - 2 files changed, 17 insertions(+) - -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 762993b..3a4e300 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -65,6 +65,7 @@ int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) - #endif - if (!ras_ns_ev_dec_list) { - ras_ns_ev_dec_list = ns_ev_decoder; -+ ras_ns_ev_dec_list->ref_count = 0; - } else { - list = ras_ns_ev_dec_list; - while (list->next) -@@ -85,6 +86,8 @@ int ras_ns_add_vendor_tables(struct ras_events *ras) - return -1; - - ns_ev_decoder = ras_ns_ev_dec_list; -+ if (ras_ns_ev_dec_list) -+ ras_ns_ev_dec_list->ref_count++; - while (ns_ev_decoder) { - if (ns_ev_decoder->add_table && !ns_ev_decoder->stmt_dec_record) { - error = ns_ev_decoder->add_table(ras, ns_ev_decoder); -@@ -127,6 +130,16 @@ void ras_ns_finalize_vendor_tables(void) - #ifdef HAVE_SQLITE3 
- struct ras_ns_ev_decoder *ns_ev_decoder = ras_ns_ev_dec_list; - -+ if (!ras_ns_ev_dec_list) -+ return; -+ -+ if (ras_ns_ev_dec_list->ref_count > 0) -+ ras_ns_ev_dec_list->ref_count--; -+ else -+ return; -+ if (ras_ns_ev_dec_list->ref_count > 0) -+ return; -+ - while (ns_ev_decoder) { - if (ns_ev_decoder->stmt_dec_record) { - ras_mc_finalize_vendor_table(ns_ev_decoder->stmt_dec_record); -@@ -140,6 +153,9 @@ void ras_ns_finalize_vendor_tables(void) - static void unregister_ns_ev_decoder(void) - { - #ifdef HAVE_SQLITE3 -+ if (!ras_ns_ev_dec_list) -+ return; -+ ras_ns_ev_dec_list->ref_count = 1; - ras_ns_finalize_vendor_tables(); - #endif - ras_ns_ev_dec_list = NULL; -diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h -index 834f84a..735ea76 100644 ---- a/ras-non-standard-handler.h -+++ b/ras-non-standard-handler.h -@@ -19,6 +19,7 @@ - - struct ras_ns_ev_decoder { - struct ras_ns_ev_decoder *next; -+ uint16_t ref_count; - const char *sec_type; - int (*add_table)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder); - int (*decode)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder, --- -2.33.1 - diff --git a/1038-rasdaemon-Update-SMCA-bank-error-descriptions.patch b/1038-rasdaemon-Update-SMCA-bank-error-descriptions.patch deleted file mode 100644 index 076231e65bffb69d58e86ee0ed442cca5b44d07c..0000000000000000000000000000000000000000 --- a/1038-rasdaemon-Update-SMCA-bank-error-descriptions.patch +++ /dev/null @@ -1,139 +0,0 @@ -From 70ce037486c1c166a253bc0015f6efd1e7f0955a Mon Sep 17 00:00:00 2001 -From: Avadhut Naik -Date: Fri, 10 May 2024 13:20:19 -0500 -Subject: [PATCH 38/85] rasdaemon: Update SMCA bank error descriptions - -Update error descriptions of SMCA bank types to support AMD's new Family -1Ah-based processors. -Also, modify some existing error descriptions to better reflect the error -received. 
- -Signed-off-by: Avadhut Naik -Signed-off-by: Mauro Carvalho Chehab ---- - mce-amd-smca.c | 32 +++++++++++++++++++++++++++++--- - 1 file changed, 29 insertions(+), 3 deletions(-) - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 233fa0a..3fd97e0 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -108,7 +108,7 @@ static const char * const smca_ls_mce_desc[] = { - "Store queue parity", - "Miss address buffer payload parity", - "L1 TLB parity", -- "Reserved", -+ "DC Tag error type 5", - "DC tag error type 6", - "DC tag error type 1", - "Internal error type 1", -@@ -125,6 +125,12 @@ static const char * const smca_ls_mce_desc[] = { - "DC tag error type 3", - "DC tag error type 5", - "L2 fill data error", -+ "Error on SCB cacheline state or address field", -+ "Error on SCB data, commit pipe 0", -+ "Error on SCB data, commit pipe 1", -+ "Error on SCB data for non-cacheable DRAM or IO", -+ "System Read Data Error detected by write combine buffer", -+ "Hardware Asserts", - }; - - static const char * const smca_ls2_mce_desc[] = { -@@ -168,7 +174,7 @@ static const char * const smca_if_mce_desc[] = { - "BP L1-BTB Multi-Hit Error", - "BP L2-BTB Multi-Hit Error", - "L2 Cache Response Poison error", -- "L2 Cache Error Response", -+ "System Read Data error", - "Hardware Assertion Error", - "L1-TLB Multi-Hit", - "L2-TLB Multi-Hit", -@@ -182,6 +188,7 @@ static const char * const smca_l2_mce_desc[] = { - "L2M Data Array ECC Error", - "Hardware Assert Error", - "SDP Read Response Parity Error", -+ "Error initiated by programmable state machine", - }; - - static const char * const smca_de_mce_desc[] = { -@@ -193,7 +200,7 @@ static const char * const smca_de_mce_desc[] = { - "Fetch address FIFO parity error", - "Patch RAM data parity error", - "Patch RAM sequencer parity error", -- "Micro-op buffer parity error", -+ "Micro-op fetch queue parity error", - "Hardware Assertion MCA Error", - }; - -@@ -235,6 +242,7 @@ static const char * const smca_l3_mce_desc[] = { - "L3 victim 
queue Data Fabric error", - "L3 Hardware Assertion", - "XI WCB Parity Poison Creation event", -+ "Machine check error initiated by DSM action", - }; - - static const char * const smca_cs_mce_desc[] = { -@@ -268,6 +276,9 @@ static const char * const smca_cs2_mce_desc[] = { - "Address Violation on the no data channel", - "Security Violation on the no data channel", - "Hardware Assert Error", -+ "Shadow Tag Array Protocol Error", -+ "Shadow Tag ECC Error", -+ "Shadow Tag Transaction Error", - }; - - /* -@@ -303,6 +314,8 @@ static const char * const smca_pie_mce_desc[] = { - "A deferred error was detected in the DF", - "Watch Dog Timer", - "An SRAM ECC error was detected in the CNLI block", -+ "Register access during DF Cstate", -+ "DSM Error", - }; - - static const char * const smca_umc_mce_desc[] = { -@@ -318,6 +331,11 @@ static const char * const smca_umc_mce_desc[] = { - "ECS Error", - "UMC Throttling Error", - "Read CRC Error", -+ "Reserved", -+ "Reserved", -+ "Reserved", -+ "Reserved", -+ "RFM SRAM ECC error", - }; - - static const char * const smca_umc_quirk_mce_desc[] = { -@@ -391,6 +409,12 @@ static const char * const smca_psp2_mce_desc[] = { - "TLB Bank 0 parity error", - "TLB Bank 1 parity error", - "System Hub Read Buffer ECC or parity error", -+ "FUSE IP SRAM ECC or parity error", -+ "PCRU FUSE SRAM ECC or parity error", -+ "SIB SRAM parity error", -+ "mpASP SECEMC Error", -+ "mpASP A5 Hang", -+ "SIB WDT error", - }; - - static const char * const smca_smu_mce_desc[] = { -@@ -430,6 +454,7 @@ static const char * const smca_mp5_mce_desc[] = { - "Instruction Cache Bank B ECC or parity error", - "Instruction Tag Cache Bank A ECC or parity error", - "Instruction Tag Cache Bank B ECC or parity error", -+ "Fuse SRAM ECC or parity error", - }; - - static const char * const smca_mpdma_mce_desc[] = { -@@ -482,6 +507,7 @@ static const char * const smca_mpdma_mce_desc[] = { - "MPDMA PTE Internal Data FIFO ECC or parity error", - "MPDMA PTE Command Memory DMA ECC or 
parity error", - "MPDMA PTE Command Memory Internal ECC or parity error", -+ "MPDMA TVF SDP Master Memory 7 ECC or parity error", - }; - - static const char * const smca_nbio_mce_desc[] = { --- -2.33.1 - diff --git a/1039-rasdaemon-Add-Corrected-Internal-Error-for-aer_cor_e.patch b/1039-rasdaemon-Add-Corrected-Internal-Error-for-aer_cor_e.patch deleted file mode 100644 index a2bb30e577a6e8b8f72f015f3cb958980ee0feda..0000000000000000000000000000000000000000 --- a/1039-rasdaemon-Add-Corrected-Internal-Error-for-aer_cor_e.patch +++ /dev/null @@ -1,30 +0,0 @@ -From dd0f10dd1a7c128b7c20f4d9bb1e469e3cbabacc Mon Sep 17 00:00:00 2001 -From: Jesus Esquivel -Date: Mon, 3 Jun 2024 16:47:20 -0600 -Subject: [PATCH 39/85] rasdaemon: Add Corrected Internal Error for - aer_cor_errors - -Add "Corrected Internal Error" for aer_cor_errors to decode -the error reported in status register in bit 14. - -Signed-off-by: Jesus Esquivel -Signed-off-by: Mauro Carvalho Chehab ---- - ras-aer-handler.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/ras-aer-handler.c b/ras-aer-handler.c -index 6f4cb2b..d6898e0 100644 ---- a/ras-aer-handler.c -+++ b/ras-aer-handler.c -@@ -35,6 +35,7 @@ static const char *aer_cor_errors[32] = { - [8] = "RELAY_NUM Rollover", - [12] = "Replay Timer Timeout", - [13] = "Advisory Non-Fatal", -+ [14] = "Corrected Internal Error", - }; - - /* bit field meaning for uncorrectable error */ --- -2.33.1 - diff --git a/1040-anolis-ras-arm-handler-be-compatible-with-upstream-K.patch b/1040-anolis-ras-arm-handler-be-compatible-with-upstream-K.patch deleted file mode 100644 index 7970d2fcfc544b8d989a9b53baa9c6ba2a5e9bd7..0000000000000000000000000000000000000000 --- a/1040-anolis-ras-arm-handler-be-compatible-with-upstream-K.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 26fa7f2122282920f4d3963db5f664d5f1a381a3 Mon Sep 17 00:00:00 2001 -From: Mauro Carvalho Chehab -Date: Tue, 25 Jun 2024 10:05:45 +0200 -Subject: [PATCH 40/85] anolis: ras-arm-handler: be compatible with 
upstream - Kernel - -Changeset e37eb2f11a82 ("Add code to decode Ampere specific error") -broke ARM event record with upstream Kernel, as it requires a different -trace event than the one that it is on upstream Kernel, and it is -part of a pending pull request: - - https://lore.kernel.org/all/20240321-b4-arm-ras-error-vendor-info-v5-rc3-v5-0-850f9bfb97a8@os.amperecomputing.com/ - -Restore its behavior by making parsing the UEFI 2.6+ N.17 and N.16 -table extra fields to be optional. That should make it compatible -with current upstream Kernels again. - -Fixes: e37eb2f11a82 ("Add code to decode Ampere specific error") -Signed-off-by: Mauro Carvalho Chehab -[Ruidong: do not use libtraceevent] ---- - ras-arm-handler.c | 78 ++++++++++++++++++++++++----------------------- - 1 file changed, 40 insertions(+), 38 deletions(-) - -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index abd8c9b..731176d 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -207,51 +207,53 @@ int ras_arm_event_handler(struct trace_seq *s, - ev.psci_state = val; - trace_seq_printf(s, " psci_state: %d", ev.psci_state); - -- if (pevent_get_field_val(s, event, "pei_len", record, &val, 1) < 0) -- return -1; -- ev.pei_len = val; -- trace_seq_printf(s, " ARM Processor Err Info data len: %d\n", -- ev.pei_len); -- -- ev.pei_error = pevent_get_field_raw(s, event, "buf", record, &len, 1); -- if (!ev.pei_error) -- return -1; -- display_raw_data(s, ev.pei_error, ev.pei_len); -- -- if (pevent_get_field_val(s, event, "ctx_len", record, &val, 1) < 0) -- return -1; -- ev.ctx_len = val; -- trace_seq_printf(s, " ARM Processor Err Context Info data len: %d\n", -- ev.ctx_len); -- -- ev.ctx_error = pevent_get_field_raw(s, event, "buf1", record, &len, 1); -- if (!ev.ctx_error) -- return -1; -- display_raw_data(s, ev.ctx_error, ev.ctx_len); -- -- if (pevent_get_field_val(s, event, "oem_len", record, &val, 1) < 0) -- return -1; -- ev.oem_len = val; -- trace_seq_printf(s, " Vendor Specific Err Info data len: 
%d\n", -- ev.oem_len); -- -- ev.vsei_error = pevent_get_field_raw(s, event, "buf2", record, &len, 1); -- if (!ev.vsei_error) -- return -1; -+ // Upstream kKernels up to version 6.10 don't decode UEFI 2.6+ N.17 table -+ if (pevent_get_field_val(s, event, "pei_len", record, &val, 1) >= 0) { -+ -+ ev.pei_len = val; -+ trace_seq_printf(s, " ARM Processor Err Info data len: %d\n", -+ ev.pei_len); -+ -+ ev.pei_error = pevent_get_field_raw(s, event, "buf", record, &len, 1); -+ if (!ev.pei_error) -+ return -1; -+ display_raw_data(s, ev.pei_error, ev.pei_len); -+ -+ if (pevent_get_field_val(s, event, "ctx_len", record, &val, 1) < 0) -+ return -1; -+ ev.ctx_len = val; -+ trace_seq_printf(s, " ARM Processor Err Context Info data len: %d\n", -+ ev.ctx_len); -+ -+ ev.ctx_error = pevent_get_field_raw(s, event, "buf1", record, &len, 1); -+ if (!ev.ctx_error) -+ return -1; -+ display_raw_data(s, ev.ctx_error, ev.ctx_len); -+ -+ if (pevent_get_field_val(s, event, "oem_len", record, &val, 1) < 0) -+ return -1; -+ ev.oem_len = val; -+ trace_seq_printf(s, " Vendor Specific Err Info data len: %d\n", -+ ev.oem_len); -+ -+ ev.vsei_error = pevent_get_field_raw(s, event, "buf2", record, &len, 1); -+ if (!ev.vsei_error) -+ return -1; - - #ifdef HAVE_AMP_NS_DECODE -- //decode ampere specific error -- decode_amp_payload0_err_regs(NULL, s, -- (struct amp_payload0_type_sec *)ev.vsei_error); -+ //decode ampere specific error -+ decode_amp_payload0_err_regs(NULL, s, -+ (struct amp_payload0_type_sec *)ev.vsei_error); - #else -- display_raw_data(s, ev.vsei_error, ev.oem_len); -+ display_raw_data(s, ev.vsei_error, ev.oem_len); - #endif -- - #ifdef HAVE_CPU_FAULT_ISOLATION -- if (ras_handle_cpu_error(s, record, event, &ev, now) < 0) -- return -1; -+ if (ras_handle_cpu_error(s, record, event, &ev, now) < 0) -+ printf("Can't do CPU fault isolation!\n"); - #endif - -+ } -+ - /* Insert data into the SGBD */ - #ifdef HAVE_SQLITE3 - ras_store_arm_record(ras, &ev); --- -2.33.1 - diff --git 
a/1041-rasdaemon-add-mc_event-trigger.patch b/1041-rasdaemon-add-mc_event-trigger.patch deleted file mode 100644 index 0e3ae606e1b5868b11200ce785f71c9733ca03b2..0000000000000000000000000000000000000000 --- a/1041-rasdaemon-add-mc_event-trigger.patch +++ /dev/null @@ -1,382 +0,0 @@ -From 2ec86db2ab6c31670ba038b175f53aba920b7fe3 Mon Sep 17 00:00:00 2001 -From: Ruidong Tian -Date: Thu, 23 Nov 2023 17:47:25 +0800 -Subject: [PATCH 41/85] rasdaemon: add mc_event trigger - -Allow users to run a trigger when RAS mc_event occurs, The mc_event -trigger is separated into CE trigger and UE trigger, this is because -CE is more frequent than UE, and the CE trigger will lead to more -performance hits. Users can choose different triggers for CE/UE to -reduce this effect. - -Users can config trigger in /etc/sysconfig/rasdaemon: - - TRIGGER_DIR: The trigger diretory - MC_CE_TRIGGER: The script executed when corrected error occurs. - MC_UE_TRIGGER: The script executed when uncorrected error occurs. - -No script will be executed if MC_CE_TRIGGER/MC_UE_TRIGGER is null. 
- -Signed-off-by: Ruidong Tian -Signed-off-by: Mauro Carvalho Chehab ---- - Makefile.am | 8 ++-- - contrib/mc_event_trigger | 24 ++++++++++++ - misc/rasdaemon.env | 18 ++++++++- - ras-events.c | 17 +++++++++ - ras-mc-handler.c | 81 ++++++++++++++++++++++++++++++++++++++++ - ras-mc-handler.h | 2 + - trigger.c | 60 +++++++++++++++++++++++++++++ - trigger.h | 13 +++++++ - 8 files changed, 218 insertions(+), 5 deletions(-) - create mode 100755 contrib/mc_event_trigger - create mode 100644 trigger.c - create mode 100644 trigger.h - -diff --git a/Makefile.am b/Makefile.am -index b16cf34..735d5a7 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -17,7 +17,7 @@ all-local: $(SYSTEMD_SERVICES) - - sbin_PROGRAMS = rasdaemon - rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ -- bitfield.c -+ bitfield.c trigger.c - if WITH_SQLITE3 - rasdaemon_SOURCES += ras-record.c - endif -@@ -77,7 +77,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ - ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ - non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ -- ras-cpu-isolation.h queue.h non-standard-yitian.h -+ ras-cpu-isolation.h queue.h non-standard-yitian.h trigger.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that -@@ -104,6 +104,6 @@ upload: - # custom target - install-data-local: - $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/dimm_labels.d" --if WITH_MEMORY_CE_PFA -+ $(install_sh) -d "$(DESTDIR)@sysconfdir@/ras/triggers" - $(install_sh) @abs_srcdir@/misc/rasdaemon.env "$(DESTDIR)@SYSCONFDEFDIR@/rasdaemon" --endif -+ $(install_sh) @abs_srcdir@/contrib/mc_event_trigger "$(DESTDIR)@sysconfdir@/ras/triggers/mc_event_trigger" -diff --git a/contrib/mc_event_trigger b/contrib/mc_event_trigger -new file mode 100755 -index 0000000..5c6ccfa ---- /dev/null -+++ 
b/contrib/mc_event_trigger -@@ -0,0 +1,24 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon in daemon mode when a -+# mc_event is occured, environment variables include all information -+# reported by tracepoint. -+# -+# environment: -+# TIMESTAMP Timestamp when error occurred -+# COUNT Number of errors of the same type -+# TYPE Error type from Corrected/Uncorrected -+# MESSAGE Error message -+# LABEL Label of the affected DIMM(s) -+# MC_INDEX DIMM identifier from DMI/SMBIOS if available -+# TOP_LAYER Top layer of the error -+# MIDDLE_LAYER Middle layer of the error -+# LOWER_LAYER Low layer of the error -+# ADDRESS Error address -+# GRAIN Minimum granularity for an error report, in bytes -+# SYNDROME Syndrome of the error (or 0 if unknown or if the syndrome is not applicable) -+# DRIVER_DETAIL Other driver-specific detail about the error -+# -+ -+[ -x ./mc_event_trigger.local ] && . ./mc_event_trigger.local -+ -+exit 0 -diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env -index 7cb18e8..3389a73 100644 ---- a/misc/rasdaemon.env -+++ b/misc/rasdaemon.env -@@ -43,4 +43,20 @@ CPU_CE_THRESHOLD="18" - CPU_ISOLATION_CYCLE="24h" - - # Prevent excessive isolation from causing an avalanche effect --CPU_ISOLATION_LIMIT="10" -\ No newline at end of file -+CPU_ISOLATION_LIMIT="10" -+ -+# Event Trigger -+ -+# Event trigger will be executed when the specified event occurs. -+# -+# Execute triggers path -+# For example: TRIGGER_DIR=/etc/ras/triggers -+TRIGGER_DIR= -+ -+# Execute these triggers when the mc_event occured, the triggers will not -+# be executed if the trigger is not specified. 
-+# For example: -+# MC_CE_TRIGGER=mc_event_trigger -+# MC_UE_TRIGGER=mc_event_trigger -+MC_CE_TRIGGER= -+MC_UE_TRIGGER= -diff --git a/ras-events.c b/ras-events.c -index 8d70e43..b071de9 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -42,6 +42,7 @@ - #include "ras-logger.h" - #include "ras-page-isolation.h" - #include "ras-cpu-isolation.h" -+#include "trigger.h" - - /* - * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -@@ -59,6 +60,10 @@ - - extern char* choices_disable; - -+const static struct event_trigger event_triggers[] = { -+ { "mc_event", &mc_event_trigger_setup }, -+}; -+ - static int get_debugfs_dir(char *tracing_dir, size_t len) - { - FILE *fp; -@@ -257,6 +262,16 @@ free_ras: - return rc; - } - -+static void setup_event_trigger(char *event) -+{ -+ struct event_trigger trigger; -+ for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) { -+ trigger = event_triggers[i]; -+ if (!strcmp(event, trigger.name)) -+ trigger.setup(); -+ } -+} -+ - /* - * Set kernel filter. libtrace doesn't provide an API for setting filters - * in kernel, we have to implement it here. 
-@@ -842,6 +857,8 @@ static int add_event_handler(struct ras_events *ras, struct pevent *pevent, - return EINVAL; - } - -+ setup_event_trigger(event); -+ - log(ALL, LOG_INFO, "Enabled event %s:%s\n", group, event); - - return 0; -diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index 42b05cd..d857ca3 100644 ---- a/ras-mc-handler.c -+++ b/ras-mc-handler.c -@@ -15,16 +15,91 @@ - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -+#define _GNU_SOURCE - #include - #include - #include - #include - #include "libtrace/kbuffer.h" -+#include - #include "ras-mc-handler.h" - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-page-isolation.h" - #include "ras-report.h" -+#include "trigger.h" -+ -+#define MAX_ENV 30 -+static char *mc_ce_trigger; -+static char *mc_ue_trigger; -+ -+void mc_event_trigger_setup(void) -+{ -+ mc_ce_trigger = getenv("MC_CE_TRIGGER"); -+ if (!mc_ce_trigger || !strcmp(mc_ce_trigger, "") -+ || trigger_check(mc_ce_trigger) < 0) { -+ log(SYSLOG, LOG_ERR, "Cannot access mc_event ce trigger `%s`\n", -+ mc_ce_trigger); -+ } else -+ log(SYSLOG, LOG_INFO, "Setup mc_event ce trigger `%s`\n", -+ mc_ce_trigger); -+ -+ mc_ue_trigger = getenv("MC_UE_TRIGGER"); -+ if (!mc_ue_trigger || !strcmp(mc_ue_trigger, "") -+ || trigger_check(mc_ue_trigger) < 0) { -+ log(SYSLOG, LOG_ERR, "Cannot access mc_event ue trigger `%s`\n", -+ mc_ue_trigger); -+ } else -+ log(SYSLOG, LOG_INFO, "Setup mc_event ue trigger `%s`\n", -+ mc_ue_trigger); -+} -+ -+static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger) -+{ -+ char *env[MAX_ENV]; -+ int ei = 0; -+ int i; -+ -+ if (!mc_trigger || !strcmp(mc_trigger, "")) -+ return; -+ -+ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "COUNT=%d", 
ev->error_count) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TYPE=%s", ev->error_type) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MESSAGE=%s", ev->msg) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "LABEL=%s", ev->label) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MC_INDEX=%d", ev->mc_index) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TOP_LAYER=%d", ev->top_layer) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "MIDDLE_LAYER=%d", ev->middle_layer) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "LOWER_LAYER=%d", ev->lower_layer) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "ADDRESS=%llx", ev->address) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "GRAIN=%lld", ev->grain) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "SYNDROME=%llx", ev->syndrome) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "DRIVER_DETAIL=%s", ev->driver_detail) < 0) -+ goto free; -+ env[ei] = NULL; -+ assert(ei < MAX_ENV); -+ -+ run_trigger(mc_trigger, NULL, env, "mc_event"); -+ -+free: -+ for (i = 0; i < ei; i++) -+ free(env[i]); -+} - - int ras_mc_event_handler(struct trace_seq *s, - struct pevent_record *record, -@@ -195,6 +270,12 @@ int ras_mc_event_handler(struct trace_seq *s, - ras_report_mc_event(ras, &ev); - #endif - -+ if (!strcmp(ev.error_type, "Corrected")) -+ run_mc_trigger(&ev, mc_ce_trigger); -+ -+ if (!strcmp(ev.error_type, "Uncorrected")) -+ run_mc_trigger(&ev, mc_ue_trigger); -+ - return 0; - - parse_error: -diff --git a/ras-mc-handler.h b/ras-mc-handler.h -index 2e3dfc5..dc5c545 100644 ---- a/ras-mc-handler.h -+++ b/ras-mc-handler.h -@@ -22,6 +22,8 @@ - #include "ras-events.h" - #include "libtrace/event-parse.h" - -+void mc_event_trigger_setup(void); -+ - int ras_mc_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context); -diff --git a/trigger.c b/trigger.c -new file mode 100644 -index 0000000..95fb8ca ---- /dev/null -+++ b/trigger.c -@@ -0,0 +1,60 @@ -+#define _GNU_SOURCE -+#include -+#include 
-+#include -+#include -+#include "ras-logger.h" -+#include "trigger.h" -+ -+void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter) -+{ -+ pid_t child; -+ char *path; -+ int status; -+ char *trigger_dir = getenv("TRIGGER_DIR"); -+ -+ log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter); -+ -+ if (asprintf(&path, "%s/%s", trigger_dir, trigger) < 0) -+ return; -+ -+ child = fork(); -+ if (child < 0) { -+ log(SYSLOG, LOG_ERR, "Cannot create process for trigger"); -+ return; -+ } -+ -+ if (child == 0) { -+ execve(path, argv, env); -+ _exit(127); -+ } else { -+ waitpid(child, &status, 0); -+ if (WIFEXITED(status) && WEXITSTATUS(status)) { -+ log(SYSLOG, LOG_INFO, "Trigger %s exited with status %d", -+ trigger, WEXITSTATUS(status)); -+ } else if (WIFSIGNALED(status)) { -+ log(SYSLOG, LOG_INFO, "Trigger %s killed by signal %d", -+ trigger, WTERMSIG(status)); -+ } -+ } -+} -+ -+int trigger_check(char *s) -+{ -+ char *name; -+ int rc; -+ char *trigger_dir = getenv("TRIGGER_DIR"); -+ -+ if (trigger_dir) { -+ if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) -+ return -1; -+ } else -+ name = s; -+ -+ rc = access(name, R_OK|X_OK); -+ -+ if (trigger_dir) -+ free(name); -+ -+ return rc; -+} -diff --git a/trigger.h b/trigger.h -new file mode 100644 -index 0000000..556a7f2 ---- /dev/null -+++ b/trigger.h -@@ -0,0 +1,13 @@ -+#ifndef __TRIGGER_H__ -+#define __TRIGGER_H__ -+ -+struct event_trigger { -+ const char *name; -+ void (*setup)(void); -+}; -+ -+int trigger_check(char *s); -+void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter); -+ -+ -+#endif --- -2.33.1 - diff --git a/1042-ras-mc-handler-cleanup-trigger-logic.patch b/1042-ras-mc-handler-cleanup-trigger-logic.patch deleted file mode 100644 index 15f4cb985b91b44b2fdaf1301a436a99e2f41ea7..0000000000000000000000000000000000000000 --- a/1042-ras-mc-handler-cleanup-trigger-logic.patch +++ /dev/null @@ -1,190 +0,0 @@ -From 
dcfa32bc1266fa0eaa52a9b42aeff62e5a947cdd Mon Sep 17 00:00:00 2001 -From: Mauro Carvalho Chehab -Date: Tue, 16 Jul 2024 07:38:13 +0200 -Subject: [PATCH 42/85] ras-mc-handler: cleanup trigger logic - -- Only setup mc_ce_trigger/mc_ue_trigger if the trigger is - valid; - -- Check if the trigger is there before doing strcmp, as - checking if a pointer is not null is faster than strcmp(); - -- Ensure that the trigger env vars will be const, as we don't - want to accidentally override those env vars; - -- Print trigger enabled messages when rasdaemon runs with -f; - -- ensure that trigger variables will initialize to NULL; - -- coding style cleanups. - -Signed-off-by: Mauro Carvalho Chehab ---- - ras-events.c | 3 ++- - ras-mc-handler.c | 60 +++++++++++++++++++++++++++++------------------- - trigger.c | 4 ++-- - trigger.h | 3 +-- - 4 files changed, 41 insertions(+), 29 deletions(-) - -diff --git a/ras-events.c b/ras-events.c -index b071de9..c2eb8f0 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -60,7 +60,7 @@ - - extern char* choices_disable; - --const static struct event_trigger event_triggers[] = { -+static const struct event_trigger event_triggers[] = { - { "mc_event", &mc_event_trigger_setup }, - }; - -@@ -265,6 +265,7 @@ free_ras: - static void setup_event_trigger(char *event) - { - struct event_trigger trigger; -+ - for (int i = 0; i < ARRAY_SIZE(event_triggers); i++) { - trigger = event_triggers[i]; - if (!strcmp(event, trigger.name)) -diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index d857ca3..203c5af 100644 ---- a/ras-mc-handler.c -+++ b/ras-mc-handler.c -@@ -16,42 +16,54 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - #define _GNU_SOURCE -+#include - #include - #include - #include - #include - #include "libtrace/kbuffer.h" --#include - #include "ras-mc-handler.h" --#include "ras-record.h" - #include "ras-logger.h" - #include "ras-page-isolation.h" -+#include "ras-record.h" - #include "ras-report.h" - #include 
"trigger.h" - - #define MAX_ENV 30 --static char *mc_ce_trigger; --static char *mc_ue_trigger; -+static const char *mc_ce_trigger = NULL; -+static const char *mc_ue_trigger = NULL; - - void mc_event_trigger_setup(void) - { -- mc_ce_trigger = getenv("MC_CE_TRIGGER"); -- if (!mc_ce_trigger || !strcmp(mc_ce_trigger, "") -- || trigger_check(mc_ce_trigger) < 0) { -- log(SYSLOG, LOG_ERR, "Cannot access mc_event ce trigger `%s`\n", -- mc_ce_trigger); -- } else -- log(SYSLOG, LOG_INFO, "Setup mc_event ce trigger `%s`\n", -- mc_ce_trigger); -- -- mc_ue_trigger = getenv("MC_UE_TRIGGER"); -- if (!mc_ue_trigger || !strcmp(mc_ue_trigger, "") -- || trigger_check(mc_ue_trigger) < 0) { -- log(SYSLOG, LOG_ERR, "Cannot access mc_event ue trigger `%s`\n", -- mc_ue_trigger); -- } else -- log(SYSLOG, LOG_INFO, "Setup mc_event ue trigger `%s`\n", -- mc_ue_trigger); -+ const char *trigger; -+ -+ trigger = getenv("MC_CE_TRIGGER"); -+ if (trigger && strcmp(trigger, "")) { -+ if (trigger_check(trigger) < 0) { -+ log(ALL, LOG_ERR, -+ "Cannot access mc_event ce trigger `%s`\n", -+ trigger); -+ } else { -+ log(ALL, LOG_INFO, -+ "Setup mc_event ce trigger `%s`\n", -+ trigger); -+ mc_ce_trigger = trigger; -+ } -+ } -+ -+ trigger = getenv("MC_UE_TRIGGER"); -+ if (trigger && strcmp(trigger, "")) { -+ if (trigger_check(trigger) < 0) { -+ log(ALL, LOG_ERR, -+ "Cannot access mc_event ue trigger `%s`\n", -+ trigger); -+ } else { -+ log(ALL, LOG_INFO, -+ "Setup mc_event ue trigger `%s`\n", -+ trigger); -+ mc_ue_trigger = trigger; -+ } -+ } - } - - static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger) -@@ -60,7 +72,7 @@ static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger) - int ei = 0; - int i; - -- if (!mc_trigger || !strcmp(mc_trigger, "")) -+ if (!strcmp(mc_trigger, "")) - return; - - if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) -@@ -270,10 +282,10 @@ int ras_mc_event_handler(struct trace_seq *s, - 
ras_report_mc_event(ras, &ev); - #endif - -- if (!strcmp(ev.error_type, "Corrected")) -+ if (mc_ce_trigger && !strcmp(ev.error_type, "Corrected")) - run_mc_trigger(&ev, mc_ce_trigger); - -- if (!strcmp(ev.error_type, "Uncorrected")) -+ if (mc_ue_trigger && !strcmp(ev.error_type, "Uncorrected")) - run_mc_trigger(&ev, mc_ue_trigger); - - return 0; -diff --git a/trigger.c b/trigger.c -index 95fb8ca..0d91e05 100644 ---- a/trigger.c -+++ b/trigger.c -@@ -6,7 +6,7 @@ - #include "ras-logger.h" - #include "trigger.h" - --void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter) -+void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter) - { - pid_t child; - char *path; -@@ -39,7 +39,7 @@ void run_trigger(const char *trigger, char *argv[], char **env, const char* repo - } - } - --int trigger_check(char *s) -+int trigger_check(const char *s) - { - char *name; - int rc; -diff --git a/trigger.h b/trigger.h -index 556a7f2..ccd1a9b 100644 ---- a/trigger.h -+++ b/trigger.h -@@ -6,8 +6,7 @@ struct event_trigger { - void (*setup)(void); - }; - --int trigger_check(char *s); -+int trigger_check(const char *s); - void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter); - -- - #endif --- -2.33.1 - diff --git a/1043-trigger-parse-only-once-TRIGGER_DIR-env-variable.patch b/1043-trigger-parse-only-once-TRIGGER_DIR-env-variable.patch deleted file mode 100644 index 4f14bc27cd3523530694757600cae0ed8fcb9eac..0000000000000000000000000000000000000000 --- a/1043-trigger-parse-only-once-TRIGGER_DIR-env-variable.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 092e251ac3a880284f72c45a42990734e33b4df1 Mon Sep 17 00:00:00 2001 -From: Mauro Carvalho Chehab -Date: Mon, 15 Jul 2024 13:40:37 +0200 -Subject: [PATCH 43/85] trigger: parse only once TRIGGER_DIR env variable - -Instead of parsing TRIGGER_DIR every time a new event happens, -store the trigger full path, simplifying the logic and avoiding -memory leaks. 
- -Signed-off-by: Mauro Carvalho Chehab ---- - ras-mc-handler.c | 13 ++++++------- - trigger.c | 23 +++++++++-------------- - trigger.h | 2 +- - 3 files changed, 16 insertions(+), 22 deletions(-) - -diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index 203c5af..b62dfb6 100644 ---- a/ras-mc-handler.c -+++ b/ras-mc-handler.c -@@ -39,7 +39,9 @@ void mc_event_trigger_setup(void) - - trigger = getenv("MC_CE_TRIGGER"); - if (trigger && strcmp(trigger, "")) { -- if (trigger_check(trigger) < 0) { -+ mc_ce_trigger = trigger_check(trigger); -+ -+ if (!mc_ce_trigger) { - log(ALL, LOG_ERR, - "Cannot access mc_event ce trigger `%s`\n", - trigger); -@@ -47,13 +49,14 @@ void mc_event_trigger_setup(void) - log(ALL, LOG_INFO, - "Setup mc_event ce trigger `%s`\n", - trigger); -- mc_ce_trigger = trigger; - } - } - - trigger = getenv("MC_UE_TRIGGER"); - if (trigger && strcmp(trigger, "")) { -- if (trigger_check(trigger) < 0) { -+ mc_ue_trigger = trigger_check(trigger); -+ -+ if (!mc_ue_trigger) { - log(ALL, LOG_ERR, - "Cannot access mc_event ue trigger `%s`\n", - trigger); -@@ -61,7 +64,6 @@ void mc_event_trigger_setup(void) - log(ALL, LOG_INFO, - "Setup mc_event ue trigger `%s`\n", - trigger); -- mc_ue_trigger = trigger; - } - } - } -@@ -72,9 +74,6 @@ static void run_mc_trigger(struct ras_mc_event *ev, const char *mc_trigger) - int ei = 0; - int i; - -- if (!strcmp(mc_trigger, "")) -- return; -- - if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) - goto free; - if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) -diff --git a/trigger.c b/trigger.c -index 0d91e05..0ee1a5e 100644 ---- a/trigger.c -+++ b/trigger.c -@@ -9,15 +9,10 @@ - void run_trigger(const char *trigger, char *argv[], char **env, const char *reporter) - { - pid_t child; -- char *path; - int status; -- char *trigger_dir = getenv("TRIGGER_DIR"); - - log(SYSLOG, LOG_INFO, "Running trigger `%s' (reporter: %s)\n", trigger, reporter); - -- if (asprintf(&path, "%s/%s", 
trigger_dir, trigger) < 0) -- return; -- - child = fork(); - if (child < 0) { - log(SYSLOG, LOG_ERR, "Cannot create process for trigger"); -@@ -25,7 +20,7 @@ void run_trigger(const char *trigger, char *argv[], char **env, const char *repo - } - - if (child == 0) { -- execve(path, argv, env); -+ execve(trigger, argv, env); - _exit(127); - } else { - waitpid(child, &status, 0); -@@ -39,7 +34,7 @@ void run_trigger(const char *trigger, char *argv[], char **env, const char *repo - } - } - --int trigger_check(const char *s) -+const char *trigger_check(const char *s) - { - char *name; - int rc; -@@ -47,14 +42,14 @@ int trigger_check(const char *s) - - if (trigger_dir) { - if (asprintf(&name, "%s/%s", trigger_dir, s) < 0) -- return -1; -- } else -- name = s; -+ return NULL; -+ s = name; -+ } - -- rc = access(name, R_OK|X_OK); -+ rc = access(s, R_OK|X_OK); - -- if (trigger_dir) -- free(name); -+ if (!rc) -+ return(s); - -- return rc; -+ return NULL; - } -diff --git a/trigger.h b/trigger.h -index ccd1a9b..aea29b5 100644 ---- a/trigger.h -+++ b/trigger.h -@@ -6,7 +6,7 @@ struct event_trigger { - void (*setup)(void); - }; - --int trigger_check(const char *s); -+const char *trigger_check(const char *s); - void run_trigger(const char *trigger, char *argv[], char **env, const char* reporter); - - #endif --- -2.33.1 - diff --git a/1044-rasdaemon-add-mem_fail_event-trigger.patch b/1044-rasdaemon-add-mem_fail_event-trigger.patch deleted file mode 100644 index 554dc5522b8dff7adab0ddd22baab77dc96c5d02..0000000000000000000000000000000000000000 --- a/1044-rasdaemon-add-mem_fail_event-trigger.patch +++ /dev/null @@ -1,150 +0,0 @@ -From aeeaf6b0275bfbe7e7fa2686bed8c2ae2b95bb8f Mon Sep 17 00:00:00 2001 -From: Mauro Carvalho Chehab -Date: Tue, 16 Jul 2024 05:05:32 +0000 -Subject: [PATCH 44/85] rasdaemon: add mem_fail_event trigger - -This event is somewhat similar to mc_event, except that this one -occurs on ARM platforms and the fields are different. 
- -Signed-off-by: Mauro Carvalho Chehab ---- - contrib/mem_fail_trigger.sh | 12 ++++++++ - ras-events.c | 1 + - ras-memory-failure-handler.c | 56 ++++++++++++++++++++++++++++++++++++ - ras-memory-failure-handler.h | 1 + - 4 files changed, 70 insertions(+) - create mode 100755 contrib/mem_fail_trigger.sh - -diff --git a/contrib/mem_fail_trigger.sh b/contrib/mem_fail_trigger.sh -new file mode 100755 -index 0000000..a3ac362 ---- /dev/null -+++ b/contrib/mem_fail_trigger.sh -@@ -0,0 +1,12 @@ -+#!/bin/sh -+# This shell script can be executed by rasdaemon in daemon mode when a -+# memory_failure_event is occured, environment variables include all -+# information reported by tracepoint. -+# -+ -+echo TIMESTAMP: $TIMESTAMP -+echo PFN: $PFN -+echo PAGE_TYPE: $PAGE_TYPE -+echo ACTION_RESULT: $ACTION_RESULT -+ -+exit 0 -diff --git a/ras-events.c b/ras-events.c -index c2eb8f0..e1326f3 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -62,6 +62,7 @@ extern char* choices_disable; - - static const struct event_trigger event_triggers[] = { - { "mc_event", &mc_event_trigger_setup }, -+ { "memory_failure_event", &mem_fail_event_trigger_setup }, - }; - - static int get_debugfs_dir(char *tracing_dir, size_t len) -diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index 4798ead..d6e83a9 100644 ---- a/ras-memory-failure-handler.c -+++ b/ras-memory-failure-handler.c -@@ -12,6 +12,7 @@ - * GNU General Public License for more details. 
- */ - -+#include - #include - #include - #include -@@ -19,6 +20,7 @@ - #include "ras-record.h" - #include "ras-logger.h" - #include "ras-report.h" -+#include "trigger.h" - - /* Memory failure - various types of pages */ - enum mf_action_page_type { -@@ -90,6 +92,59 @@ static const struct { - { MF_RECOVERED, "Recovered" }, - }; - -+#define MAX_ENV 6 -+static const char *mf_trigger = NULL; -+ -+void mem_fail_event_trigger_setup(void) -+{ -+ const char *trigger; -+ -+ trigger = getenv("MEM_FAIL_TRIGGER"); -+ if (trigger && strcmp(trigger, "")) { -+ mf_trigger = trigger_check(trigger); -+ -+ if (!mf_trigger) { -+ log(ALL, LOG_ERR, -+ "Cannot access memory_fail_event trigger `%s`\n", -+ trigger); -+ } else { -+ log(ALL, LOG_INFO, -+ "Setup memory_fail_event trigger `%s`\n", -+ trigger); -+ } -+ } -+} -+ -+static void run_mf_trigger(struct ras_mf_event *ev) -+{ -+ char *env[MAX_ENV]; -+ int ei = 0; -+ int i; -+ -+ if (!mf_trigger) -+ return; -+ -+ if (asprintf(&env[ei++], "PATH=%s", getenv("PATH") ?: "/sbin:/usr/sbin:/bin:/usr/bin") < 0) -+ goto free; -+ if (asprintf(&env[ei++], "TIMESTAMP=%s", ev->timestamp) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "PFN=%s", ev->pfn) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "PAGE_TYPE=%s", ev->page_type) < 0) -+ goto free; -+ if (asprintf(&env[ei++], "ACTION_RESULT=%s", ev->action_result) < 0) -+ goto free; -+ -+ env[ei] = NULL; -+ assert(ei < MAX_ENV); -+ -+ run_trigger(mf_trigger, NULL, env, "memory_fail_event"); -+ -+free: -+ for (i = 0; i < ei; i++) -+ free(env[i]); -+} -+ - static const char *get_page_type(int page_type) - { - unsigned int i; -@@ -169,6 +224,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, - /* Report event to ABRT */ - ras_report_mf_event(ras, &ev); - #endif -+ run_mf_trigger(&ev); - - return 0; - } -diff --git a/ras-memory-failure-handler.h b/ras-memory-failure-handler.h -index b9e9971..30d8e9d 100644 ---- a/ras-memory-failure-handler.h -+++ b/ras-memory-failure-handler.h -@@ -18,6 
+18,7 @@ - #include "ras-events.h" - #include "libtrace/event-parse.h" - -+void mem_fail_event_trigger_setup(void); - int ras_memory_failure_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context); --- -2.33.1 - diff --git a/1045-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch b/1045-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch deleted file mode 100644 index b9ed013f5cecc94e8a84ccf083c124032323db0c..0000000000000000000000000000000000000000 --- a/1045-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 1c31e9948a2c19ecb0f39d5f14910c80316ac625 Mon Sep 17 00:00:00 2001 -From: Aristeu Rozanski -Date: Thu, 19 Jan 2023 08:45:57 -0500 -Subject: [PATCH 45/85] rasdaemon: ras-report: fix possible but unlikely file - descriptor leak - -Found with covscan. - -Signed-off-by: Aristeu Rozanski -Signed-off-by: Mauro Carvalho Chehab ---- - ras-report.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/ras-report.c b/ras-report.c -index ea3a9b6..62d5eb7 100644 ---- a/ras-report.c -+++ b/ras-report.c -@@ -434,7 +434,7 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ - - mc_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -484,7 +484,7 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ - - aer_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -533,7 +533,7 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar - - non_standard_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -578,7 +578,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){ - - arm_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -624,7 +624,7 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ - - mce_fail: - -- if(sockfd > 0){ 
-+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -674,7 +674,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev){ - - devlink_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -723,7 +723,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e - done = 1; - - diskerror_fail: -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -768,7 +768,7 @@ int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) - done = 1; - - mf_fail: -- if (sockfd > 0) -+ if (sockfd >= 0) - close(sockfd); - - if (done) --- -2.33.1 - diff --git a/1046-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch b/1046-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch deleted file mode 100644 index 9988e4eafb74702cc7557eb78a20092f640cf111..0000000000000000000000000000000000000000 --- a/1046-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 838234ed7e9cf87b740556c5e15e3e236b723fa4 Mon Sep 17 00:00:00 2001 -From: hubin -Date: Thu, 18 May 2023 16:14:41 +0800 -Subject: [PATCH 46/85] ras-events: quit loop in read_ras_event when kbuf data - is broken - -when kbuf data is broken, kbuffer_next_event() may move kbuf->index back to -the current kbuf->index position, causing dead loop. - -In this situation, rasdaemon will repeatedly parse an invalid event, and -print warning like "ug! negative record size -8!", pushing cpu utilization -rate to 100%. - -when kbuf data is broken, discard current page and continue reading next page -kbuf. 
- -Signed-off-by: hubin -Signed-off-by: Mauro Carvalho Chehab ---- - ras-events.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/ras-events.c b/ras-events.c -index e1326f3..2cc54b3 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -546,6 +546,11 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - kbuffer_load_subbuffer(kbuf, page); - - while ((data = kbuffer_read_event(kbuf, &time_stamp))) { -+ if (kbuffer_curr_size(kbuf) < 0) { -+ log(TERM, LOG_ERR, "invalid kbuf data, discard\n"); -+ break; -+ } -+ - parse_ras_data(&pdata[i], - kbuf, data, time_stamp); - --- -2.33.1 - diff --git a/1047-C-files-cleanup-coding-style.patch b/1047-C-files-cleanup-coding-style.patch deleted file mode 100644 index 913e9ad0a9b9ca99055da7407d637b79662344c4..0000000000000000000000000000000000000000 --- a/1047-C-files-cleanup-coding-style.patch +++ /dev/null @@ -1,3343 +0,0 @@ -From 531a16aa949221932f76108bf9c74fe164fec7df Mon Sep 17 00:00:00 2001 -From: Mauro Carvalho Chehab -Date: Mon, 22 Jan 2024 08:36:47 +0100 -Subject: [PATCH 47/85] C files: cleanup coding style - -The rasdaemon conding style follows Linux Kernel where it makes sense. - -Yet, changes made overtime ended with some coding style non-compliances. - -Adjust rasdaemon coding style by using: - - scripts/checkpatch.pl --fix-inplace --strict *.c --ignore PREFER_KERNEL_TYPES - -And doing some manual fixups where the script didn't work. -As a bonus, some typos were also fixed on some rasdaemon messages. 
- -Signed-off-by: Mauro Carvalho Chehab ---- - bitfield.c | 13 ++- - mce-amd-k8.c | 28 +++-- - mce-amd-smca.c | 17 ++- - mce-amd.c | 6 +- - mce-intel-broadwell-de.c | 6 +- - mce-intel-broadwell-epex.c | 8 +- - mce-intel-dunnington.c | 2 + - mce-intel-haswell.c | 9 +- - mce-intel-ivb.c | 8 +- - mce-intel-knl.c | 2 +- - mce-intel-nehalem.c | 5 +- - mce-intel-p4-p6.c | 10 +- - mce-intel-sb.c | 8 +- - mce-intel-skylake-xeon.c | 6 +- - mce-intel-tulsa.c | 1 + - mce-intel.c | 47 ++++---- - non-standard-ampere.c | 134 +++++++++++----------- - non-standard-hisi_hip08.c | 4 +- - non-standard-hisilicon.c | 18 +-- - non-standard-yitian.c | 28 ++--- - queue.c | 14 +-- - ras-aer-handler.c | 4 +- - ras-arm-handler.c | 6 +- - ras-cpu-isolation.c | 28 ++--- - ras-devlink-handler.c | 5 +- - ras-diskerror-handler.c | 3 +- - ras-events.c | 36 +++--- - ras-extlog-handler.c | 11 +- - ras-mc-handler.c | 10 +- - ras-mce-handler.c | 26 ++--- - ras-memory-failure-handler.c | 3 +- - ras-non-standard-handler.c | 11 +- - ras-page-isolation.c | 21 ++-- - ras-record.c | 196 ++++++++++++++++----------------- - ras-report.c | 208 +++++++++++++++++++---------------- - rasdaemon.c | 7 +- - rbtree.c | 32 +++--- - 37 files changed, 504 insertions(+), 477 deletions(-) - -diff --git a/bitfield.c b/bitfield.c -index f004755..2076683 100644 ---- a/bitfield.c -+++ b/bitfield.c -@@ -25,10 +25,10 @@ - #include "ras-mce-handler.h" - #include "bitfield.h" - --unsigned bitfield_msg(char *buf, size_t len, const char **bitarray, -- unsigned array_len, -- unsigned bit_offset, unsigned ignore_bits, -- uint64_t status) -+unsigned int bitfield_msg(char *buf, size_t len, const char **bitarray, -+ unsigned int array_len, -+ unsigned int bit_offset, unsigned int ignore_bits, -+ uint64_t status) - { - int i, n; - char *p = buf; -@@ -64,6 +64,7 @@ unsigned bitfield_msg(char *buf, size_t len, const char **bitarray, - static uint64_t bitmask(uint64_t i) - { - uint64_t mask = 1; -+ - while (mask < i) - mask = (mask << 1) | 
1; - return mask; -@@ -77,6 +78,7 @@ void decode_bitfield(struct mce_event *e, uint64_t status, - for (f = fields; f->str; f++) { - uint64_t v = (status >> f->start_bit) & bitmask(f->stringlen - 1); - char *s = NULL; -+ - if (v < f->stringlen) - s = f->str[v]; - if (!s) { -@@ -93,11 +95,14 @@ void decode_numfield(struct mce_event *e, uint64_t status, - struct numfield *fields) - { - struct numfield *f; -+ - for (f = fields; f->name; f++) { - uint64_t mask = (1ULL << (f->end - f->start + 1)) - 1; - uint64_t v = (status >> f->start) & mask; -+ - if (v > 0 || f->force) { - char fmt[32] = {0}; -+ - snprintf(fmt, 32, "%%s: %s\n", f->fmt ? f->fmt : "%Lu"); - mce_snprintf(e->error_msg, fmt, f->name, v); - } -diff --git a/mce-amd-k8.c b/mce-amd-k8.c -index dc772c2..f27b823 100644 ---- a/mce-amd-k8.c -+++ b/mce-amd-k8.c -@@ -43,7 +43,7 @@ static const char *k8bank[] = { - }; - - static const char *k8threshold[] = { -- [0 ... K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknow threshold counter", -+ [0 ... K8_MCELOG_THRESHOLD_DRAM_ECC - 1] = "Unknown threshold counter", - [K8_MCELOG_THRESHOLD_DRAM_ECC] = "MC4_MISC0 DRAM threshold", - [K8_MCELOG_THRESHOLD_LINK] = "MC4_MISC1 Link threshold", - [K8_MCELOG_THRESHOLD_L3_CACHE] = "MC4_MISC2 L3 Cache threshold", -@@ -56,25 +56,31 @@ static const char *k8threshold[] = { - static const char *transaction[] = { - "instruction", "data", "generic", "reserved" - }; -+ - static const char *cachelevel[] = { - "0", "1", "2", "generic" - }; -+ - static const char *memtrans[] = { - "generic error", "generic read", "generic write", "data read", - "data write", "instruction fetch", "prefetch", "evict", "snoop", - "?", "?", "?", "?", "?", "?", "?" 
- }; -+ - static const char *partproc[] = { - "local node origin", "local node response", - "local node observed", "generic participation" - }; -+ - static const char *timeout[] = { - "request didn't time out", - "request timed out" - }; -+ - static const char *memoryio[] = { - "memory", "res.", "i/o", "generic" - }; -+ - static const char *nbextendederr[] = { - "RAM ECC error", - "CRC error", -@@ -96,6 +102,7 @@ static const char *nbextendederr[] = { - "L3 Cache Tag Error", - "L3 Cache LRU Error" - }; -+ - static const char *highbits[32] = { - [31] = "valid", - [30] = "error overflow (multiple errors)", -@@ -164,7 +171,7 @@ static void decode_k8_dc_mc(struct mce_event *e) - if (e->status & (3ULL << 45)) { - mce_snprintf(e->error_msg, - "Data cache ECC error (syndrome %x)", -- (uint32_t) (e->status >> 47) & 0xff); -+ (uint32_t)(e->status >> 47) & 0xff); - if (e->status & (1ULL << 40)) - mce_snprintf(e->error_msg, "found by scrubber"); - } -@@ -185,7 +192,7 @@ static void decode_k8_ic_mc(struct mce_event *e) - - if ((errcode & 0xfff0) == 0x0010) - mce_snprintf(e->error_msg, "TLB parity error in %s array", -- (exterrcode == 0) ? "physical" : "virtual"); -+ (exterrcode == 0) ? "physical" : "virtual"); - } - - static void decode_k8_bu_mc(struct mce_event *e) -@@ -196,10 +203,10 @@ static void decode_k8_bu_mc(struct mce_event *e) - mce_snprintf(e->error_msg, "L2 cache ECC error"); - - mce_snprintf(e->error_msg, "%s array error", -- !exterrcode ? "Bus or cache" : "Cache tag"); -+ !exterrcode ? 
"Bus or cache" : "Cache tag"); - } - --static void decode_k8_nb_mc(struct mce_event *e, unsigned *memerr) -+static void decode_k8_nb_mc(struct mce_event *e, unsigned int *memerr) - { - unsigned short exterrcode = (e->status >> 16) & 0x0f; - -@@ -209,13 +216,13 @@ static void decode_k8_nb_mc(struct mce_event *e, unsigned *memerr) - case 0: - *memerr = 1; - mce_snprintf(e->error_msg, "ECC syndrome = %x", -- (uint32_t) (e->status >> 47) & 0xff); -+ (uint32_t)(e->status >> 47) & 0xff); - break; - case 8: - *memerr = 1; - mce_snprintf(e->error_msg, "Chipkill ECC syndrome = %x", -- (uint32_t) ((((e->status >> 24) & 0xff) << 8) -- | ((e->status >> 47) & 0xff))); -+ (uint32_t)((((e->status >> 24) & 0xff) << 8) -+ | ((e->status >> 47) & 0xff))); - break; - case 1: - case 2: -@@ -223,7 +230,7 @@ static void decode_k8_nb_mc(struct mce_event *e, unsigned *memerr) - case 4: - case 6: - mce_snprintf(e->error_msg, "link number = %x", -- (uint32_t) (e->status >> 36) & 0xf); -+ (uint32_t)(e->status >> 36) & 0xf); - break; - } - } -@@ -251,11 +258,12 @@ static void bank_name(struct mce_event *e) - - int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e) - { -- unsigned ismemerr = 0; -+ unsigned int ismemerr = 0; - - /* Don't handle GART errors */ - if (e->bank == 4) { - unsigned short exterrcode = (e->status >> 16) & 0x0f; -+ - if (exterrcode == 5 && (e->status & (1ULL << 61))) { - return -1; - } -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 3fd97e0..c66a5f7 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -74,10 +74,10 @@ enum smca_bank_types { - SMCA_PCIE, /* PCI Express Unit */ - SMCA_PCIE_V2, - SMCA_XGMI_PCS, /* xGMI PCS Unit */ -- SMCA_NBIF, /*NBIF Unit */ -- SMCA_SHUB, /* System Hub Unit */ -- SMCA_SATA, /* SATA Unit */ -- SMCA_USB, /* USB Unit */ -+ SMCA_NBIF, /* NBIF Unit */ -+ SMCA_SHUB, /* System Hub Unit */ -+ SMCA_SATA, /* SATA Unit */ -+ SMCA_USB, /* USB Unit */ - SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */ - SMCA_USR_CP, /* 
Ultra Short Reach Control Plane Controller */ - SMCA_GMI_PCS, /* GMI PCS Unit */ -@@ -833,7 +833,7 @@ static struct smca_bank_name smca_names[] = { - [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" }, - [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" }, - [SMCA_MP5] = { "Microprocessor 5 Unit" }, -- [SMCA_MPDMA] = { "MPDMA Unit" }, -+ [SMCA_MPDMA] = { "MPDMA Unit" }, - [SMCA_NBIO] = { "Northbridge IO Unit" }, - [SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" }, - [SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" }, -@@ -851,7 +851,6 @@ static struct smca_bank_name smca_names[] = { - - void amd_decode_errcode(struct mce_event *e) - { -- - decode_amd_errcode(e); - - if (e->status & MCI_STATUS_POISON) -@@ -859,8 +858,8 @@ void amd_decode_errcode(struct mce_event *e) - - if (e->status & MCI_STATUS_TCC) - mce_snprintf(e->mcistatus_msg, "Task_context_corrupt"); -- - } -+ - /* - * To find the UMC channel represented by this bank we need to match on its - * instance_id. The instance_id of a bank is held in the lower 32 bits of its -@@ -890,7 +889,7 @@ static int find_hbm_channel(struct mce_event *e) - return (umc % 2) ? 
tmp + 4 : tmp; - } - --static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype) -+static inline void fixup_hwid(struct mce_priv *m, uint32_t *hwid_mcatype) - { - if (m->family == 0x19) { - switch (m->model) { -@@ -1006,7 +1005,7 @@ int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) - uint64_t mcgstatus = e->mcgstatus; - - mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld", -- (long long)e->mcgstatus); -+ (long long)e->mcgstatus); - - if (mcgstatus & MCG_STATUS_RIPV) - mce_snprintf(e->mcgstatus_msg, "RIPV"); -diff --git a/mce-amd.c b/mce-amd.c -index 116df11..ac0dbac 100644 ---- a/mce-amd.c -+++ b/mce-amd.c -@@ -26,26 +26,31 @@ - static char *transaction[] = { - "instruction", "data", "generic", "reserved" - }; -+ - /* Error codes: cache level (LL) */ - static char *cachelevel[] = { - "reserved", "L1", "L2", "L3/generic" - }; -+ - /* Error codes: memory transaction type (RRRR) */ - static char *memtrans[] = { - "generic", "generic read", "generic write", "data read", - "data write", "instruction fetch", "prefetch", "evict", "snoop", - "?", "?", "?", "?", "?", "?", "?" 
- }; -+ - /* Participation Processor */ - static char *partproc[] = { - "local node origin", "local node response", - "local node observed", "generic participation" - }; -+ - /* Timeout */ - static char *timeout[] = { - "request didn't time out", - "request timed out" - }; -+ - /* internal unclassified error code */ - static char *internal[] = { "reserved", - "reserved", -@@ -118,5 +123,4 @@ void decode_amd_errcode(struct mce_event *e) - PP_MSG(ec), TO_MSG(ec), - R4_MSG(ec), LL_MSG(ec)); - return; -- - } -diff --git a/mce-intel-broadwell-de.c b/mce-intel-broadwell-de.c -index d52c82e..8210782 100644 ---- a/mce-intel-broadwell-de.c -+++ b/mce-intel-broadwell-de.c -@@ -78,7 +78,7 @@ void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e) - { - uint64_t status = e->status; - uint32_t mca = status & 0xffff; -- unsigned rank0 = -1, rank1 = -1, chan; -+ unsigned int rank0 = -1, rank1 = -1, chan; - - switch (e->bank) { - case 4: -@@ -115,7 +115,7 @@ void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e) - - /* Ignore unless this is an corrected extended error from an iMC bank */ - if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || -- !test_prefix(7, status & 0xefff)) -+ !test_prefix(7, status & 0xefff)) - return; - - /* -@@ -140,7 +140,7 @@ void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e) - */ - if (rank0 != -1 && rank1 != -1) - mce_snprintf(e->mc_location, "ranks=%d and %d", -- rank0, rank1); -+ rank0, rank1); - else if (rank0 != -1) - mce_snprintf(e->mc_location, "rank=%d", rank0); - } -diff --git a/mce-intel-broadwell-epex.c b/mce-intel-broadwell-epex.c -index f7cd3b6..9c863d9 100644 ---- a/mce-intel-broadwell-epex.c -+++ b/mce-intel-broadwell-epex.c -@@ -91,7 +91,7 @@ static char *qpi[] = { - [0x22] = "Phy detected in-band reset (no width change)", - [0x23] = "Link failover clock failover", - [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", -- [0x31] = "Rx detected CRC 
error - successful LLR wihout Phy re-init", -+ [0x31] = "Rx detected CRC error - successful LLR without Phy re-init", - }; - - static struct field qpi_mc[] = { -@@ -118,7 +118,7 @@ void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e) - { - uint64_t status = e->status; - uint32_t mca = status & 0xffff; -- unsigned rank0 = -1, rank1 = -1, chan; -+ unsigned int rank0 = -1, rank1 = -1, chan; - - switch (e->bank) { - case 4: -@@ -160,7 +160,7 @@ void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e) - - /* Ignore unless this is an corrected extended error from an iMC bank */ - if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || -- !test_prefix(7, status & 0xefff)) -+ !test_prefix(7, status & 0xefff)) - return; - - /* -@@ -185,7 +185,7 @@ void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e) - */ - if (rank0 != -1 && rank1 != -1) - mce_snprintf(e->mc_location, "ranks=%d and %d", -- rank0, rank1); -+ rank0, rank1); - else if (rank0 != -1) - mce_snprintf(e->mc_location, "rank=%d", rank0); - } -diff --git a/mce-intel-dunnington.c b/mce-intel-dunnington.c -index c695c62..71c3fb4 100644 ---- a/mce-intel-dunnington.c -+++ b/mce-intel-dunnington.c -@@ -91,6 +91,7 @@ static void dunnington_decode_bus(struct mce_event *e, uint64_t status) - static void dunnington_decode_internal(struct mce_event *e, uint64_t status) - { - uint32_t mca = (status >> 16) & 0xffff; -+ - if ((mca & 0xfff0) == 0) - decode_bitfield(e, mca, dnt_front_status); - else if ((mca & 0xf0ff) == 0) -@@ -104,6 +105,7 @@ static void dunnington_decode_internal(struct mce_event *e, uint64_t status) - void dunnington_decode_model(struct mce_event *e) - { - uint64_t status = e->status; -+ - if ((status & 0xffff) == 0xe0f) - dunnington_decode_bus(e, status); - else if ((status & 0xffff) == (1 << 10)) -diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c -index 1791a36..195f6ed 100644 ---- a/mce-intel-haswell.c -+++ 
b/mce-intel-haswell.c -@@ -23,7 +23,6 @@ - #include "ras-mce-handler.h" - #include "bitfield.h" - -- - /* See IA32 SDM Vol3B Table 16-20 */ - - static char *pcu_1[] = { -@@ -92,7 +91,7 @@ static char *qpi[] = { - [0x22] = "Phy detected in-band reset (no width change)", - [0x23] = "Link failover clock failover", - [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", -- [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", -+ [0x31] = "Rx detected CRC error - successful LLR without Phy re-init", - }; - - static struct field qpi_mc[] = { -@@ -120,7 +119,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) - { - uint64_t status = e->status; - uint32_t mca = status & 0xffff; -- unsigned rank0 = -1, rank1 = -1, chan; -+ unsigned int rank0 = -1, rank1 = -1, chan; - - switch (e->bank) { - case 4: -@@ -160,7 +159,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) - - /* Ignore unless this is an corrected extended error from an iMC bank */ - if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || -- !test_prefix(7, status & 0xefff)) -+ !test_prefix(7, status & 0xefff)) - return; - - /* -@@ -185,7 +184,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) - */ - if (rank0 != -1 && rank1 != -1) - mce_snprintf(e->mc_location, "ranks=%d and %d", -- rank0, rank1); -+ rank0, rank1); - else if (rank0 != -1) - mce_snprintf(e->mc_location, "rank=%d", rank0); - } -diff --git a/mce-intel-ivb.c b/mce-intel-ivb.c -index 0c5bebc..e5e6a7a 100644 ---- a/mce-intel-ivb.c -+++ b/mce-intel-ivb.c -@@ -90,7 +90,7 @@ void ivb_decode_model(struct ras_events *ras, struct mce_event *e) - struct mce_priv *mce = ras->mce_priv; - uint64_t status = e->status; - uint32_t mca = status & 0xffff; -- unsigned rank0 = -1, rank1 = -1, chan; -+ unsigned int rank0 = -1, rank1 = -1, chan; - - switch (e->bank) { - case 4: -@@ -121,7 +121,7 @@ void ivb_decode_model(struct ras_events *ras, struct mce_event *e) - - /* 
Ignore unless this is an corrected extended error from an iMC bank */ - if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || -- !test_prefix(7, status & 0xefff)) -+ !test_prefix(7, status & 0xefff)) - return; - - /* -@@ -146,7 +146,7 @@ void ivb_decode_model(struct ras_events *ras, struct mce_event *e) - */ - if (rank0 >= 0 && rank1 >= 0) - mce_snprintf(e->mc_location, "ranks=%d and %d", -- rank0, rank1); -+ rank0, rank1); - else if (rank0 >= 0) - mce_snprintf(e->mc_location, "rank=%d", rank0); - else -@@ -162,7 +162,7 @@ void ivb_decode_model(struct ras_events *ras, struct mce_event *e) - * faling rank to a DIMM slot. - */ - #if 0 --static int failrank2dimm(unsigned failrank, int socket, int channel) -+static int failrank2dimm(unsigned int failrank, int socket, int channel) - { - switch (failrank) { - case 0: case 1: case 2: case 3: -diff --git a/mce-intel-knl.c b/mce-intel-knl.c -index 7062fbb..0e2ea80 100644 ---- a/mce-intel-knl.c -+++ b/mce-intel-knl.c -@@ -36,7 +36,7 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) - { - uint64_t status = e->status; - uint32_t mca = status & 0xffff; -- unsigned rank0 = -1, rank1 = -1, chan = 0; -+ unsigned int rank0 = -1, rank1 = -1, chan = 0; - - switch (e->bank) { - case 5: -diff --git a/mce-intel-nehalem.c b/mce-intel-nehalem.c -index ad4ce69..18992d1 100644 ---- a/mce-intel-nehalem.c -+++ b/mce-intel-nehalem.c -@@ -112,9 +112,9 @@ void nehalem_decode_model(struct mce_event *e) - uint64_t status = e->status; - uint32_t mca = status & 0xffff; - uint64_t misc = e->misc; -- unsigned channel, dimm; -+ unsigned int channel, dimm; - -- if ((mca >> 11) == 1) { /* bus and interconnect QPI */ -+ if ((mca >> 11) == 1) { /* bus and interconnect QPI */ - decode_bitfield(e, status, qpi_status); - if (status & MCI_STATUS_MISCV) { - decode_numfield(e, misc, qpi_numbers); -@@ -143,6 +143,7 @@ void xeon75xx_decode_model(struct mce_event *e) - { - uint64_t status = e->status; - uint32_t mca = status & 0xffff; 
-+ - if (mca == 0x0001) { /* internal unspecified */ - decode_bitfield(e, status, internal_error_status); - decode_numfield(e, status, internal_error_numbers); -diff --git a/mce-intel-p4-p6.c b/mce-intel-p4-p6.c -index 5c6c3ff..2751d34 100644 ---- a/mce-intel-p4-p6.c -+++ b/mce-intel-p4-p6.c -@@ -66,8 +66,8 @@ static struct field p6_shared_status[] = { - FIELD(25, bus_queue_error_type), - SBITFIELD(30, "internal BINIT"), - SBITFIELD(36, "received parity error on response transaction"), -- SBITFIELD(38, "timeout BINIT (ROB timeout)." -- " No micro-instruction retired for some time"), -+ SBITFIELD(38, -+ "timeout BINIT (ROB timeout). No micro-instruction retired for some time"), - FIELD_NULL(39), - SBITFIELD(42, "bus transaction received hard error response"), - SBITFIELD(43, "failure that caused IERR"), -@@ -86,7 +86,7 @@ static struct field p6old_status[] = { - FIELD_NULL(31), - FIELD_NULL(32), - SBITFIELD(35, "BINIT received from external bus"), -- SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"), -+ SBITFIELD(37, "Received hard error response on split transaction (Bus BINIT)"), - {} - }; - -@@ -109,7 +109,7 @@ static struct numfield p6old_status_numbers[] = { - static struct { - int value; - char *str; --} p4_model []= { -+} p4_model[] = { - {16, "FSB address parity"}, - {17, "Response hard fail"}, - {18, "Response parity"}, -@@ -123,7 +123,7 @@ static struct { - void p4_decode_model(struct mce_event *e) - { - uint32_t model = e->status & 0xffff0000L; -- unsigned i; -+ unsigned int i; - - for (i = 0; i < ARRAY_SIZE(p4_model); i++) { - if (model & (1 << p4_model[i].value)) -diff --git a/mce-intel-sb.c b/mce-intel-sb.c -index e754496..385b125 100644 ---- a/mce-intel-sb.c -+++ b/mce-intel-sb.c -@@ -82,7 +82,7 @@ void snb_decode_model(struct ras_events *ras, struct mce_event *e) - { - struct mce_priv *mce = ras->mce_priv; - uint32_t mca = e->status & 0xffff; -- unsigned rank0 = -1, rank1 = -1, chan; -+ unsigned int rank0 = -1, rank1 = -1, 
chan; - - switch (e->bank) { - case 4: -@@ -113,7 +113,7 @@ void snb_decode_model(struct ras_events *ras, struct mce_event *e) - - /* Ignore unless this is an corrected extended error from an iMC bank */ - if (e->bank < 8 || e->bank > 11 || (e->status & MCI_STATUS_UC) || -- !test_prefix(7, e->status & 0xefff)) -+ !test_prefix(7, e->status & 0xefff)) - return; - - /* -@@ -138,7 +138,7 @@ void snb_decode_model(struct ras_events *ras, struct mce_event *e) - */ - if (rank0 >= 0 && rank1 >= 0) - mce_snprintf(e->mc_location, "ranks=%d and %d", -- rank0, rank1); -+ rank0, rank1); - else if (rank0 >= 0) - mce_snprintf(e->mc_location, "rank=%d", rank0); - else -@@ -162,7 +162,7 @@ void snb_decode_model(struct ras_events *ras, struct mce_event *e) - * can be converted to a DIMM number within a channel for systems with either - * two or three DIMMs per channel. - */ --static int failrank2dimm(unsigned failrank, int socket, int channel) -+static int failrank2dimm(unsigned int failrank, int socket, int channel) - { - switch (failrank) { - case 0: case 1: case 2: case 3: -diff --git a/mce-intel-skylake-xeon.c b/mce-intel-skylake-xeon.c -index 680578a..37e5e11 100644 ---- a/mce-intel-skylake-xeon.c -+++ b/mce-intel-skylake-xeon.c -@@ -170,7 +170,7 @@ void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e) - { - uint64_t status = e->status; - uint32_t mca = status & 0xffff; -- unsigned rank0 = -1, rank1 = -1, chan; -+ unsigned int rank0 = -1, rank1 = -1, chan; - - switch (e->bank) { - case 4: -@@ -221,7 +221,7 @@ void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e) - - /* Ignore unless this is an corrected extended error from an iMC bank */ - if (e->bank < 13 || e->bank > 18 || (status & MCI_STATUS_UC) || -- !test_prefix(7, status & 0xefff)) -+ !test_prefix(7, status & 0xefff)) - return; - - /* -@@ -246,7 +246,7 @@ void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e) - */ - if (rank0 != -1 && rank1 != -1) - 
mce_snprintf(e->mc_location, "ranks=%d and %d", -- rank0, rank1); -+ rank0, rank1); - else if (rank0 != -1) - mce_snprintf(e->mc_location, "rank=%d", rank0); - } -diff --git a/mce-intel-tulsa.c b/mce-intel-tulsa.c -index e59bf06..f38d638 100644 ---- a/mce-intel-tulsa.c -+++ b/mce-intel-tulsa.c -@@ -105,6 +105,7 @@ static void tulsa_decode_bus(struct mce_event *e, uint64_t status) - static void tulsa_decode_internal(struct mce_event *e, uint64_t status) - { - uint32_t mca = (status >> 16) & 0xffff; -+ - if ((mca & 0xfff0) == 0) - decode_bitfield(e, mca, tls_front_status); - else if ((mca & 0xf0ff) == 0) -diff --git a/mce-intel.c b/mce-intel.c -index 18a9072..7f48cc4 100644 ---- a/mce-intel.c -+++ b/mce-intel.c -@@ -57,8 +57,7 @@ - #define BUS_PP_MASK 0x600 /*bit 9, bit 10*/ - #define BUS_PP_SHIFT 0x9 - --#define MCG_TES_P (1ULL<<11) /* Yellow bit cache threshold supported */ -- -+#define MCG_TES_P BIT_ULL(11) /* Yellow bit cache threshold supported */ - - static char *TT[] = { - "Instruction", -@@ -76,8 +75,8 @@ static char *LL[] = { - - static struct { - uint8_t value; -- char* str; --} RRRR [] = { -+ char *str; -+} RRRR[] = { - {0, "Generic"}, - {1, "Read"}, - {2, "Write" }, -@@ -121,7 +120,7 @@ static char *mca_msg[] = { - static char *tracking_msg[] = { - [1] = "green", - [2] = "yellow", -- [3] ="res3" -+ [3] = "res3" - }; - - static const char *arstate[4] = { -@@ -157,9 +156,9 @@ static void decode_memory_controller(struct mce_event *e, uint32_t status) - sprintf(channel, "%u", status & 0xf); - - mce_snprintf(e->error_msg, "MEMORY CONTROLLER %s_CHANNEL%s_ERR", -- mmm_mnemonic[(status >> 4) & 7], channel); -+ mmm_mnemonic[(status >> 4) & 7], channel); - mce_snprintf(e->error_msg, "Transaction: %s", -- mmm_desc[(status >> 4) & 7]); -+ mmm_desc[(status >> 4) & 7]); - } - - static void decode_termal_bank(struct mce_event *e) -@@ -207,7 +206,7 @@ static void bank_name(struct mce_event *e) - - static char *get_RRRR_str(uint8_t rrrr) - { -- unsigned i; -+ unsigned int 
i; - - for (i = 0; i < ARRAY_SIZE(RRRR); i++) { - if (RRRR[i].value == rrrr) { -@@ -220,7 +219,7 @@ static char *get_RRRR_str(uint8_t rrrr) - - #define decode_attr(arr, val) ({ \ - char *__str; \ -- if ((unsigned)(val) >= ARRAY_SIZE(arr)) \ -+ if ((unsigned int)(val) >= ARRAY_SIZE(arr)) \ - __str = "UNKNOWN"; \ - else \ - __str = (arr)[val]; \ -@@ -248,17 +247,17 @@ static void decode_mca(struct mce_event *e, uint64_t track, int *ismemerr) - decode_attr(LL, mca & 3)); - } else if (test_prefix(4, mca)) { - mce_snprintf(e->mcastatus_msg, "%s TLB %s Error", -- decode_attr(TT, (mca & TLB_TT_MASK) >> TLB_TT_SHIFT), -- decode_attr(LL, (mca & TLB_LL_MASK) >> TLB_LL_SHIFT)); -+ decode_attr(TT, (mca & TLB_TT_MASK) >> TLB_TT_SHIFT), -+ decode_attr(LL, (mca & TLB_LL_MASK) >> TLB_LL_SHIFT)); - } else if (test_prefix(8, mca)) { -- unsigned typenum = (mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT; -- unsigned levelnum = (mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT; -+ unsigned int typenum = (mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT; -+ unsigned int levelnum = (mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT; - char *type = decode_attr(TT, typenum); - char *level = decode_attr(LL, levelnum); -+ - mce_snprintf(e->mcastatus_msg, - "%s CACHE %s %s Error", type, level, -- get_RRRR_str((mca & CACHE_RRRR_MASK) >> -- CACHE_RRRR_SHIFT)); -+ get_RRRR_str((mca & CACHE_RRRR_MASK) >> CACHE_RRRR_SHIFT)); - #if 0 - /* FIXME: We shouldn't mix parsing with actions */ - if (track == 2) -@@ -313,15 +312,13 @@ static void decode_mci(struct mce_event *e, int *ismemerr) - else - mce_snprintf(e->mcistatus_msg, "Corrected_error"); - -- - if (e->status & MCI_STATUS_EN) - mce_snprintf(e->mcistatus_msg, "Error_enabled"); - -- - if (e->status & MCI_STATUS_PCC) - mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt"); - -- if (e->status & (MCI_STATUS_S|MCI_STATUS_AR)) -+ if (e->status & (MCI_STATUS_S | MCI_STATUS_AR)) - mce_snprintf(e->mcistatus_msg, "%s", - arstate[(e->status >> 55) & 3]); - -@@ -350,14 +347,14 @@ int 
parse_intel_event(struct ras_events *ras, struct mce_event *e) - - /* Check if the error is at the memory controller */ - if (((e->status & 0xffff) >> 7) == 1) { -- unsigned corr_err_cnt; -+ unsigned int corr_err_cnt; - - corr_err_cnt = EXTRACT(e->status, 38, 52); - mce_snprintf(e->mc_location, "n_errors=%d", corr_err_cnt); - } - - if (test_prefix(11, (e->status & 0xffffL))) { -- switch(mce->cputype) { -+ switch (mce->cputype) { - case CPU_P6OLD: - p6old_decode_model(e); - break; -@@ -375,7 +372,7 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) - break; - } - } -- switch(mce->cputype) { -+ switch (mce->cputype) { - case CPU_NEHALEM: - nehalem_decode_model(e); - break; -@@ -447,18 +444,18 @@ static int domsr(int cpu, int msr, int bit) - return -EINVAL; - } - } -- if (pread(fd, &data, sizeof data, msr) != sizeof data) { -+ if (pread(fd, &data, sizeof(data), msr) != sizeof(data)) { - log(ALL, LOG_ERR, - "Cannot read MSR_ERROR_CONTROL from %s\n", fpath); - return -EINVAL; - } - data |= bit; -- if (pwrite(fd, &data, sizeof data, msr) != sizeof data) { -+ if (pwrite(fd, &data, sizeof(data), msr) != sizeof(data)) { - log(ALL, LOG_ERR, - "Cannot write MSR_ERROR_CONTROL to %s\n", fpath); - return -EINVAL; - } -- if (pread(fd, &data, sizeof data, msr) != sizeof data) { -+ if (pread(fd, &data, sizeof(data), msr) != sizeof(data)) { - log(ALL, LOG_ERR, - "Cannot re-read MSR_ERROR_CONTROL from %s\n", fpath); - return -EINVAL; -@@ -472,7 +469,7 @@ static int domsr(int cpu, int msr, int bit) - return 0; - } - --int set_intel_imc_log(enum cputype cputype, unsigned ncpus) -+int set_intel_imc_log(enum cputype cputype, unsigned int ncpus) - { - int cpu, msr, bit, rc; - -diff --git a/non-standard-ampere.c b/non-standard-ampere.c -index 05b5252..79e09a0 100644 ---- a/non-standard-ampere.c -+++ b/non-standard-ampere.c -@@ -31,6 +31,7 @@ static const char * const disp_payload0_err_reg_name[] = { - "MISC2:", - "MISC3:", - }; -+ - /*PCIe AER Error Payload Type 1*/ - 
static const char * const disp_payload1_err_reg_name[] = { - "Error Type:", -@@ -86,7 +87,6 @@ static const char * const err_cpm_sub_type[] = { - "ARMv8 Core 1", - }; - -- - static const char * const err_mcu_sub_type[] = { - "ERR0", - "ERR1", -@@ -155,8 +155,6 @@ static char *err_smmu_sub_type(int etype) - return "unknown error"; - } - -- -- - static const char * const err_pcie_aer_sub_type[] = { - "Root Port", - "Device", -@@ -173,7 +171,6 @@ static char *err_peci_rasdp_sub_type(int etype) - return "unknown error"; - } - -- - static const char * const err_ocm_sub_type[] = { - "ERR0", - "ERR1", -@@ -327,7 +324,7 @@ static const struct amp_ras_type_info amp_payload_error_type[] = { - - /*get the error type name*/ - static const char *oem_type_name(const struct amp_ras_type_info *info, -- uint8_t type_id) -+ uint8_t type_id) - { - const struct amp_ras_type_info *type = &info[0]; - -@@ -350,7 +347,7 @@ static const char *oem_subtype_name(const struct amp_ras_type_info *info, - - if (type->id != type_id) - continue; -- if (type->sub == NULL) -+ if (!type->sub) - return type->name; - if (sub_type_id >= type->sub_num) - return "unknown"; -@@ -477,7 +474,7 @@ static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder, - } - - static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder, -- const char *name) -+ const char *name) - { - int rc; - -@@ -502,37 +499,37 @@ static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder, - - /*save all Ampere Specific Error Payload type 0 to sqlite3 database*/ - static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -+ const char *type_str, const char *subtype_str, - const struct amp_payload0_type_sec *err) - { -- if (ev_decoder != NULL) { -+ if (ev_decoder) { - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, -- AMP_PAYLOAD0_FIELD_TYPE, 0, type_str); -+ AMP_PAYLOAD0_FIELD_TYPE, 0, type_str); - record_amp_data(ev_decoder, 
AMP_OEM_DATA_TYPE_TEXT, -- AMP_PAYLOAD0_FIELD_SUB_TYPE, 0, subtype_str); -+ AMP_PAYLOAD0_FIELD_SUB_TYPE, 0, subtype_str); - - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD0_FIELD_INS, INSTANCE(err->instance), NULL); -+ AMP_PAYLOAD0_FIELD_INS, INSTANCE(err->instance), NULL); - - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD0_FIELD_SOCKET_NUM, -+ AMP_PAYLOAD0_FIELD_SOCKET_NUM, - SOCKET_NUM(err->instance), NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD0_FIELD_STATUS_REG, err->err_status, NULL); -+ AMP_PAYLOAD0_FIELD_STATUS_REG, err->err_status, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD0_FIELD_ADDR_REG, -+ AMP_PAYLOAD0_FIELD_ADDR_REG, - err->err_addr, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD0_FIELD_MISC0, -+ AMP_PAYLOAD0_FIELD_MISC0, - err->err_misc_0, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD0_FIELD_MISC1, -+ AMP_PAYLOAD0_FIELD_MISC1, - err->err_misc_1, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD0_FIELD_MISC2, -+ AMP_PAYLOAD0_FIELD_MISC2, - err->err_misc_2, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD0_FIELD_MISC3, -+ AMP_PAYLOAD0_FIELD_MISC3, - err->err_misc_3, NULL); - store_amp_err_data(ev_decoder, "amp_payload0_event_tab"); - } -@@ -540,10 +537,10 @@ static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder, - - /*save all Ampere Specific Error Payload type 1 to sqlite3 database*/ - static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -+ const char *type_str, const char *subtype_str, - const struct amp_payload1_type_sec *err) - { -- if (ev_decoder != NULL) { -+ if (ev_decoder) { - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, - AMP_PAYLOAD1_FIELD_TYPE, 0, type_str); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, -@@ -590,45 +587,45 
@@ static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder, - - /*save all Ampere Specific Error Payload type 2 to sqlite3 database*/ - static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -- const struct amp_payload2_type_sec *err) -+ const char *type_str, const char *subtype_str, -+ const struct amp_payload2_type_sec *err) - { -- if (ev_decoder != NULL) { -+ if (ev_decoder) { - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, -- AMP_PAYLOAD2_FIELD_TYPE, 0, type_str); -+ AMP_PAYLOAD2_FIELD_TYPE, 0, type_str); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, -- AMP_PAYLOAD2_FIELD_SUB_TYPE, 0, subtype_str); -+ AMP_PAYLOAD2_FIELD_SUB_TYPE, 0, subtype_str); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_INS, INSTANCE(err->instance), NULL); -+ AMP_PAYLOAD2_FIELD_INS, INSTANCE(err->instance), NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_SOCKET_NUM, -+ AMP_PAYLOAD2_FIELD_SOCKET_NUM, - SOCKET_NUM(err->instance), NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_CE_REPORT_REG, -+ AMP_PAYLOAD2_FIELD_CE_REPORT_REG, - err->ce_register, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_CE_LOACATION, -+ AMP_PAYLOAD2_FIELD_CE_LOACATION, - err->ce_location, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_CE_ADDR, -+ AMP_PAYLOAD2_FIELD_CE_ADDR, - err->ce_addr, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_UE_REPORT_REG, -+ AMP_PAYLOAD2_FIELD_UE_REPORT_REG, - err->ue_register, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_UE_LOCATION, -+ AMP_PAYLOAD2_FIELD_UE_LOCATION, - err->ue_location, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_UE_ADDR, -+ AMP_PAYLOAD2_FIELD_UE_ADDR, - err->ue_addr, NULL); - 
record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD2_FIELD_RESERVED1, -+ AMP_PAYLOAD2_FIELD_RESERVED1, - err->reserved1, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD2_FIELD_RESERVED2, -+ AMP_PAYLOAD2_FIELD_RESERVED2, - err->reserved2, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD2_FIELD_RESERVED3, -+ AMP_PAYLOAD2_FIELD_RESERVED3, - err->reserved3, NULL); - store_amp_err_data(ev_decoder, "amp_payload2_event_tab"); - } -@@ -636,36 +633,36 @@ static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder, - - /*save all Ampere Specific Error Payload type 3 to sqlite3 database*/ - static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -+ const char *type_str, const char *subtype_str, - const struct amp_payload3_type_sec *err) - { -- if (ev_decoder != NULL) { -+ if (ev_decoder) { - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, -- AMP_PAYLOAD3_FIELD_TYPE, 0, type_str); -+ AMP_PAYLOAD3_FIELD_TYPE, 0, type_str); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, -- AMP_PAYLOAD3_FIELD_SUB_TYPE, 0, subtype_str); -+ AMP_PAYLOAD3_FIELD_SUB_TYPE, 0, subtype_str); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD3_FIELD_INS, INSTANCE(err->instance), NULL); -+ AMP_PAYLOAD3_FIELD_INS, INSTANCE(err->instance), NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD3_FIELD_SOCKET_NUM, -+ AMP_PAYLOAD3_FIELD_SOCKET_NUM, - SOCKET_NUM(err->instance), NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT, -- AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0, -+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0, - err->fw_speci_data0, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1, -+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1, - err->fw_speci_data1, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2, -+ 
AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2, - err->fw_speci_data2, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3, -+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3, - err->fw_speci_data3, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4, -+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4, - err->fw_speci_data4, NULL); - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64, -- AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5, -+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5, - err->fw_speci_data5, NULL); - store_amp_err_data(ev_decoder, "amp_payload3_event_tab"); - } -@@ -680,29 +677,29 @@ static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder, - } - - static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -- const struct amp_payload0_type_sec *err) -+ const char *type_str, const char *subtype_str, -+ const struct amp_payload0_type_sec *err) - { - return 0; - } - - static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -- const struct amp_payload1_type_sec *err) -+ const char *type_str, const char *subtype_str, -+ const struct amp_payload1_type_sec *err) - { - return 0; - } - - static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -- const struct amp_payload2_type_sec *err) -+ const char *type_str, const char *subtype_str, -+ const struct amp_payload2_type_sec *err) - { - return 0; - } - - static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder, -- const char *type_str, const char *subtype_str, -- const struct amp_payload3_type_sec *err) -+ const char *type_str, const char *subtype_str, -+ const struct amp_payload3_type_sec *err) - { - return 0; - } -@@ -716,7 +713,7 @@ static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder, char *name) - /*decode ampere specific error payload type 0, the 
CPU's data is save*/ - /*to sqlite by ras-arm-handler, others are saved by this function.*/ - void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder, -- struct trace_seq *s, -+ struct trace_seq *s, - const struct amp_payload0_type_sec *err) - { - char buf[AMP_PAYLOAD0_BUF_LEN]; -@@ -803,8 +800,8 @@ void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder, - - /*decode ampere specific error payload type 1 and save to sqlite db*/ - static void decode_amp_payload1_err_regs(struct ras_ns_ev_decoder *ev_decoder, -- struct trace_seq *s, -- const struct amp_payload1_type_sec *err) -+ struct trace_seq *s, -+ const struct amp_payload1_type_sec *err) - { - char buf[AMP_PAYLOAD0_BUF_LEN]; - char *p = buf; -@@ -887,8 +884,8 @@ static void decode_amp_payload1_err_regs(struct ras_ns_ev_decoder *ev_decoder, - - /*decode ampere specific error payload type 2 and save to sqlite db*/ - static void decode_amp_payload2_err_regs(struct ras_ns_ev_decoder *ev_decoder, -- struct trace_seq *s, -- const struct amp_payload2_type_sec *err) -+ struct trace_seq *s, -+ const struct amp_payload2_type_sec *err) - { - char buf[AMP_PAYLOAD0_BUF_LEN]; - char *p = buf; -@@ -903,7 +900,7 @@ static void decode_amp_payload2_err_regs(struct ras_ns_ev_decoder *ev_decoder, - subtype_str = err_peci_rasdp_sub_type(err->subtype); - else - subtype_str = oem_subtype_name(amp_payload_error_type, -- TYPE(err->type), err->subtype); -+ TYPE(err->type), err->subtype); - //display error type - p += snprintf(p, end - p, " %s", disp_payload2_err_reg_name[i++]); - p += snprintf(p, end - p, " %s\n", type_str); -@@ -972,8 +969,8 @@ static void decode_amp_payload2_err_regs(struct ras_ns_ev_decoder *ev_decoder, - - /*decode ampere specific error payload type 3 and save to sqlite db*/ - static void decode_amp_payload3_err_regs(struct ras_ns_ev_decoder *ev_decoder, -- struct trace_seq *s, -- const struct amp_payload3_type_sec *err) -+ struct trace_seq *s, -+ const struct amp_payload3_type_sec *err) 
- { - char buf[AMP_PAYLOAD0_BUF_LEN]; - char *p = buf; -@@ -989,7 +986,6 @@ static void decode_amp_payload3_err_regs(struct ras_ns_ev_decoder *ev_decoder, - p += snprintf(p, end - p, " %s", disp_payload3_err_reg_name[i++]); - p += snprintf(p, end - p, " %s\n", type_str); - -- - //display error subtype - p += snprintf(p, end - p, " %s", disp_payload3_err_reg_name[i++]); - p += snprintf(p, end - p, " %s\n", subtype_str); -@@ -1074,13 +1070,13 @@ static int decode_amp_oem_type_error(struct ras_events *ras, - if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, - &db_tab) != SQLITE_OK) { - trace_seq_printf(s, -- "create sql %s fail\n", -- sqlite3_table_list[payload_type]); -+ "create sql %s fail\n", -+ sqlite3_table_list[payload_type]); - return -1; - } - } - record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT, -- id, 0, event->timestamp); -+ id, 0, event->timestamp); - #endif - - if (payload_type == PAYLOAD_TYPE_0) { -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 0899812..e482a7a 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -453,7 +453,7 @@ static const char *oem_submodule_name(const struct hisi_module_info *info, - if (module->id != module_id) - continue; - -- if (module->sub == NULL) -+ if (!module->sub) - return module->name; - - if (sub_module_id >= module->sub_num) -@@ -675,7 +675,7 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras, - struct ras_non_standard_event *event) - { - const struct hisi_oem_type1_err_sec *err = -- (struct hisi_oem_type1_err_sec*)event->error; -+ (struct hisi_oem_type1_err_sec *)event->error; - - if (err->val_bits == 0) { - trace_seq_printf(s, "%s: no valid error information\n", -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -index 7296d28..e9ea5df 100644 ---- a/non-standard-hisilicon.c -+++ b/non-standard-hisilicon.c -@@ -88,10 +88,10 @@ struct hisi_event { - - #ifdef HAVE_SQLITE3 - void record_vendor_data(struct ras_ns_ev_decoder 
*ev_decoder, -- enum hisi_oem_data_type data_type, -+ enum hisi_oem_data_type data_type, - int id, int64_t data, const char *text) - { -- if (ev_decoder->stmt_dec_record == NULL) -+ if (!ev_decoder->stmt_dec_record) - return; - - switch (data_type) { -@@ -111,7 +111,7 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name) - { - int rc; - -- if (ev_decoder->stmt_dec_record == NULL) -+ if (!ev_decoder->stmt_dec_record) - return 0; - - rc = sqlite3_step(ev_decoder->stmt_dec_record); -@@ -171,13 +171,13 @@ static const struct db_table_descriptor hisi_common_section_tab = { - }; - #endif - --static const char* soc_desc[] = { -+static const char *soc_desc[] = { - "Kunpeng916", - "Kunpeng920", - "Kunpeng930", - }; - --static const char* module_name[] = { -+static const char *module_name[] = { - "MN", - "PLL", - "SLLC", -@@ -221,9 +221,9 @@ static const char* module_name[] = { - "HBMC", - }; - --static const char* get_soc_desc(uint8_t soc_id) -+static const char *get_soc_desc(uint8_t soc_id) - { -- if (soc_id >= sizeof(soc_desc)/sizeof(char *)) -+ if (soc_id >= sizeof(soc_desc) / sizeof(char *)) - return "unknown"; - - return soc_desc[soc_id]; -@@ -232,7 +232,7 @@ static const char* get_soc_desc(uint8_t soc_id) - static void decode_module(struct ras_ns_ev_decoder *ev_decoder, - struct hisi_event *event, uint8_t module_id) - { -- if (module_id >= sizeof(module_name)/sizeof(char *)) { -+ if (module_id >= sizeof(module_name) / sizeof(char *)) { - HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id); - record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT, - HISI_COMMON_FIELD_MODULE_ID, -@@ -246,7 +246,7 @@ static void decode_module(struct ras_ns_ev_decoder *ev_decoder, - } - - static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder, -- const struct hisi_common_error_section *err, -+ const struct hisi_common_error_section *err, - struct hisi_event *event) - { - HISI_SNPRINTF(event->error_msg, "[ 
table_version=%hhu", err->version); -diff --git a/non-standard-yitian.c b/non-standard-yitian.c -index 4c30514..daadb95 100644 ---- a/non-standard-yitian.c -+++ b/non-standard-yitian.c -@@ -86,25 +86,25 @@ static const struct db_table_descriptor yitian_ddr_payload_section_tab = { - }; - - int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, -- struct ras_yitian_ddr_payload_event *ev) -+ struct ras_yitian_ddr_payload_event *ev) - { - int rc; - struct sqlite3_stmt *stmt = ev_decoder->stmt_dec_record; - - log(TERM, LOG_INFO, "yitian_ddr_reg_dump_event store: %p\n", stmt); - -- sqlite3_bind_text (stmt, 1, ev->timestamp, -1, NULL); -- sqlite3_bind_int64 (stmt, 2, ev->address); -- sqlite3_bind_text (stmt, 3, ev->reg_msg, -1, NULL); -+ sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_int64(stmt, 2, ev->address); -+ sqlite3_bind_text(stmt, 3, ev->reg_msg, -1, NULL); - - rc = sqlite3_step(stmt); - if (rc != SQLITE_OK && rc != SQLITE_DONE) - log(TERM, LOG_ERR, -- "Failed to do yitian_ddr_reg_dump_event step on sqlite: error = %d\n", rc); -+ "Failed to do yitian_ddr_reg_dump_event step on sqlite: error = %d\n", rc); - rc = sqlite3_reset(stmt); - if (rc != SQLITE_OK && rc != SQLITE_DONE) - log(TERM, LOG_ERR, -- "Failed reset yitian_ddr_reg_dump_event on sqlite: error = %d\n", rc); -+ "Failed reset yitian_ddr_reg_dump_event on sqlite: error = %d\n", rc); - log(TERM, LOG_INFO, "register inserted at db\n"); - - return rc; -@@ -112,7 +112,7 @@ int record_yitian_ddr_reg_dump_event(struct ras_ns_ev_decoder *ev_decoder, - #endif - - static const char *oem_type_name(const struct yitian_ras_type_info *info, -- uint8_t type_id) -+ uint8_t type_id) - { - const struct yitian_ras_type_info *type = &info[0]; - -@@ -134,7 +134,7 @@ static const char *oem_subtype_name(const struct yitian_ras_type_info *info, - - if (type->id != type_id) - continue; -- if (type->sub == NULL) -+ if (!type->sub) - return type->name; - if (sub_type_id >= type->sub_num) - 
return "unknown"; -@@ -144,7 +144,7 @@ static const char *oem_subtype_name(const struct yitian_ras_type_info *info, - } - - void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, -- struct trace_seq *s, -+ struct trace_seq *s, - const struct yitian_ddr_payload_type_sec *err, - struct ras_events *ras) - { -@@ -168,7 +168,7 @@ void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, - tm = localtime(&now); - if (tm) - strftime(ev.timestamp, sizeof(ev.timestamp), -- "%Y-%m-%d %H:%M:%S %z", tm); -+ "%Y-%m-%d %H:%M:%S %z", tm); - //display error type - p += snprintf(p, end - p, " %s", yitian_ddr_payload_err_reg_name[i++]); - p += snprintf(p, end - p, " %s,", type_str); -@@ -204,18 +204,17 @@ void decode_yitian_ddr_payload_err_regs(struct ras_ns_ev_decoder *ev_decoder, - #ifdef HAVE_SQLITE3 - record_yitian_ddr_reg_dump_event(ev_decoder, &ev); - #endif -- - } - - static int add_yitian_common_table(struct ras_events *ras, -- struct ras_ns_ev_decoder *ev_decoder) -+ struct ras_ns_ev_decoder *ev_decoder) - { - #ifdef HAVE_SQLITE3 - if (ras->record_events && !ev_decoder->stmt_dec_record) { - if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record, -- &yitian_ddr_payload_section_tab) != SQLITE_OK) { -+ &yitian_ddr_payload_section_tab) != SQLITE_OK) { - log(TERM, LOG_WARNING, -- "Failed to create sql yitian_ddr_payload_section_tab\n"); -+ "Failed to create sql yitian_ddr_payload_section_tab\n"); - return -1; - } - } -@@ -253,6 +252,7 @@ struct ras_ns_ev_decoder yitian_ns_oem_decoder[] = { - static void __attribute__((constructor)) yitian_ns_init(void) - { - int i; -+ - for (i = 0; i < ARRAY_SIZE(yitian_ns_oem_decoder); i++) - register_ns_ev_decoder(&yitian_ns_oem_decoder[i]); - } -diff --git a/queue.c b/queue.c -index 65b6fb8..a90ed6a 100644 ---- a/queue.c -+++ b/queue.c -@@ -29,7 +29,7 @@ struct link_queue *init_queue(void) - struct link_queue *queue = NULL; - - queue = (struct link_queue *)malloc(sizeof(struct link_queue)); -- 
if (queue == NULL) { -+ if (!queue) { - log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n"); - return NULL; - } -@@ -43,13 +43,13 @@ struct link_queue *init_queue(void) - - void clear_queue(struct link_queue *queue) - { -- if (queue == NULL) -+ if (!queue) - return; - - struct queue_node *node = queue->head; - struct queue_node *tmp = NULL; - -- while (node != NULL) { -+ while (node) { - tmp = node; - node = node->next; - free(tmp); -@@ -72,7 +72,7 @@ void free_queue(struct link_queue *queue) - void push(struct link_queue *queue, struct queue_node *node) - { - /* there is no element in the queue */ -- if (queue->head == NULL) -+ if (!queue->head) - queue->head = node; - else - queue->tail->next = node; -@@ -85,7 +85,7 @@ int pop(struct link_queue *queue) - { - struct queue_node *tmp = NULL; - -- if (queue == NULL || is_empty(queue)) -+ if (!queue || is_empty(queue)) - return -1; - - tmp = queue->head; -@@ -98,7 +98,7 @@ int pop(struct link_queue *queue) - - struct queue_node *front(struct link_queue *queue) - { -- if (queue == NULL) -+ if (!queue) - return NULL; - - return queue->head; -@@ -109,7 +109,7 @@ struct queue_node *node_create(time_t time, unsigned int value) - struct queue_node *node = NULL; - - node = (struct queue_node *)malloc(sizeof(struct queue_node)); -- if (node != NULL) { -+ if (node) { - node->time = time; - node->value = value; - node->next = NULL; -diff --git a/ras-aer-handler.c b/ras-aer-handler.c -index d6898e0..a867ae4 100644 ---- a/ras-aer-handler.c -+++ b/ras-aer-handler.c -@@ -82,7 +82,7 @@ int ras_aer_event_handler(struct trace_seq *s, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -@@ -177,7 +177,7 @@ int ras_aer_event_handler(struct trace_seq *s, - sel_data[4] = (((dev & 0x1f) << 3) | (fn & 0x7)); - - sprintf(ipmi_add_sel, -- "ipmitool raw 0x0a 0x44 0x00 0x00 0xc0 0x00 0x00 0x00 0x00 0x3a 0xcd 0x00 0xc0 0x%02x 0x%02x 
0x%02x 0x%02x 0x%02x", -+ "ipmitool raw 0x0a 0x44 0x00 0x00 0xc0 0x00 0x00 0x00 0x00 0x3a 0xcd 0x00 0xc0 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x", - sel_data[0], sel_data[1], sel_data[2], sel_data[3], sel_data[4]); - - system(ipmi_add_sel); -diff --git a/ras-arm-handler.c b/ras-arm-handler.c -index 731176d..c7e9fd5 100644 ---- a/ras-arm-handler.c -+++ b/ras-arm-handler.c -@@ -29,7 +29,7 @@ - #define BIT2 2 - - void display_raw_data(struct trace_seq *s, -- const uint8_t *buf, -+ const uint8_t *buf, - uint32_t datalen) - { - int i = 0, line_count = 0; -@@ -72,7 +72,7 @@ static int count_errors(struct ras_arm_event *ev, int sev) - - if (ev->pei_len % err_info_size != 0) { - log(TERM, LOG_ERR, -- "The event data does not match to the ARM Processor Error Information Structure\n"); -+ "The event data does not match to the ARM Processor Error Information Structure\n"); - return num; - } - num_pei = ev->pei_len / err_info_size; -@@ -172,7 +172,7 @@ int ras_arm_event_handler(struct trace_seq *s, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c -index 90633fd..8af31e9 100644 ---- a/ras-cpu-isolation.c -+++ b/ras-cpu-isolation.c -@@ -120,7 +120,7 @@ static int init_cpu_info(unsigned int cpus) - cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus); - if (!cpu_infos) { - log(TERM, LOG_ERR, -- "Failed to allocate memory for cpu infos in %s.\n", __func__); -+ "Failed to allocate memory for cpu infos in %s.\n", __func__); - return -1; - } - -@@ -130,9 +130,9 @@ static int init_cpu_info(unsigned int cpus) - cpu_infos[i].state = get_cpu_status(i); - cpu_infos[i].ce_queue = init_queue(); - -- if (cpu_infos[i].ce_queue == NULL) { -+ if (!cpu_infos[i].ce_queue) { - log(TERM, LOG_ERR, -- "Failed to allocate memory for cpu ce queue in %s.\n", __func__); -+ "Failed to allocate memory for cpu ce queue in %s.\n", 
__func__); - return -1; - } - } -@@ -147,7 +147,7 @@ static void check_config(struct isolation_param *config) - { - if (config->value > config->limit) { - log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n", -- config->value, config->limit); -+ config->value, config->limit); - config->value = config->limit; - } - } -@@ -173,7 +173,7 @@ static int parse_ul_config(struct isolation_param *config, char *env, unsigned l - for (int i = 0; i < env_size; ++i) { - if (isdigit(env[i])) { - if (*value > ULONG_MAX / DEC_CHECK || -- (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) { -+ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) { - log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX); - return -1; - } -@@ -208,7 +208,7 @@ static void init_config(struct isolation_param *config) - - if (parse_ul_config(config, env, &value) < 0) { - log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n", -- config->name, env, config->value); -+ config->name, env, config->value); - return; - } - -@@ -220,7 +220,7 @@ static int check_config_status(void) - { - char *env = getenv("CPU_ISOLATION_ENABLE"); - -- if (env == NULL || strcasecmp(env, "yes")) -+ if (!env || strcasecmp(env, "yes")) - return -1; - - return 0; -@@ -295,12 +295,12 @@ static int do_ce_handler(unsigned int cpu) - cpu_infos[cpu].ce_nums -= tmp; - } - log(TERM, LOG_INFO, -- "Current number of Corrected Errors in cpu%d in the cycle is %lu\n", -+ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n", - cpu, cpu_infos[cpu].ce_nums); - - if (cpu_infos[cpu].ce_nums >= threshold.value) { - log(TERM, LOG_INFO, -- "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n", -+ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n", - threshold.value, cpu); - return do_cpu_offline(cpu); - } -@@ -341,7 +341,7 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info) - { - struct queue_node *node = 
node_create(err_info->time, err_info->nums); - -- if (node == NULL) { -+ if (!node) { - log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n"); - return; - } -@@ -366,7 +366,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) - - if (cpu >= ncores || cpu < 0) { - log(TERM, LOG_ERR, -- "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores); -+ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores); - return; - } - -@@ -385,7 +385,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) - */ - if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) { - log(TERM, LOG_WARNING, -- "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", -+ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n", - cpu_limit.value); - return; - } -@@ -395,11 +395,11 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu) - log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu); - else if (ret == HANDLE_SUCCEED) { - log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n", -- cpu, cpu_state[cpu_infos[cpu].state]); -+ cpu, cpu_state[cpu_infos[cpu].state]); - clear_queue(cpu_infos[cpu].ce_queue); - cpu_infos[cpu].ce_nums = 0; - cpu_infos[cpu].uce_nums = 0; - } else - log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n", -- cpu, cpu_state[cpu_infos[cpu].state]); -+ cpu, cpu_state[cpu_infos[cpu].state]); - } -diff --git a/ras-devlink-handler.c b/ras-devlink-handler.c -index e52d66e..b19ccaa 100644 ---- a/ras-devlink-handler.c -+++ b/ras-devlink-handler.c -@@ -38,7 +38,7 @@ int ras_net_xmit_timeout_handler(struct trace_seq *s, - struct devlink_event ev; - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -@@ -78,7 +78,6 @@ int ras_net_xmit_timeout_handler(struct trace_seq *s, - - free(ev.msg); - return 0; -- - } - - int ras_devlink_event_handler(struct trace_seq 
*s, -@@ -104,7 +103,7 @@ int ras_devlink_event_handler(struct trace_seq *s, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c -index b46f859..618afdb 100644 ---- a/ras-diskerror-handler.c -+++ b/ras-diskerror-handler.c -@@ -30,7 +30,6 @@ - #include "ras-logger.h" - #include "ras-report.h" - -- - static const struct { - int error; - const char *name; -@@ -82,7 +81,7 @@ int ras_diskerror_event_handler(struct trace_seq *s, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -diff --git a/ras-events.c b/ras-events.c -index 2cc54b3..c83b8de 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -58,7 +58,7 @@ - #define ENDIAN KBUFFER_ENDIAN_BIG - #endif - --extern char* choices_disable; -+extern char *choices_disable; - - static const struct event_trigger event_triggers[] = { - { "mc_event", &mc_event_trigger_setup }, -@@ -71,7 +71,7 @@ static int get_debugfs_dir(char *tracing_dir, size_t len) - char line[MAX_PATH + 1 + 256]; - char *p, *type, *dir; - -- fp = fopen("/proc/mounts","r"); -+ fp = fopen("/proc/mounts", "r"); - if (!fp) { - log(ALL, LOG_INFO, "Can't open /proc/mounts"); - return errno; -@@ -99,7 +99,7 @@ static int get_debugfs_dir(char *tracing_dir, size_t len) - tracing_dir[len - 1] = '\0'; - return 0; - } -- } while(1); -+ } while (1); - - fclose(fp); - log(ALL, LOG_INFO, "Can't find debugfs\n"); -@@ -144,7 +144,7 @@ static int get_tracing_dir(struct ras_events *ras) - strcat(ras->tracing, "/tracing"); - if (has_instances) { - strcat(ras->tracing, "/instances/" TOOL_NAME); -- rc = mkdir(ras->tracing, S_IRWXU); -+ rc = mkdir(ras->tracing, 0700); - if (rc < 0 && errno != EEXIST) { - log(ALL, LOG_INFO, - "Unable to create " TOOL_NAME " instance at %s\n", -@@ -155,13 +155,14 @@ static int 
get_tracing_dir(struct ras_events *ras) - return 0; - } - --static int is_disabled_event(char *group, char *event) { -+static int is_disabled_event(char *group, char *event) -+{ - char ras_event_name[MAX_PATH + 1]; - - snprintf(ras_event_name, sizeof(ras_event_name), "%s:%s", -- group, event); -+ group, event); - -- if (choices_disable != NULL && strlen(choices_disable) != 0 && strstr(choices_disable, ras_event_name)) { -+ if (choices_disable && strlen(choices_disable) != 0 && strstr(choices_disable, ras_event_name)) { - return 1; - } - return 0; -@@ -175,6 +176,7 @@ static int __toggle_ras_mc_event(struct ras_events *ras, - { - int fd, rc; - char fname[MAX_PATH + 1]; -+ - enable = is_disabled_event(group, event) ? 0 : 1; - - snprintf(fname, sizeof(fname), "%s%s:%s\n", -@@ -188,7 +190,7 @@ static int __toggle_ras_mc_event(struct ras_events *ras, - return errno; - } - -- rc = write(fd, fname,strlen(fname)); -+ rc = write(fd, fname, strlen(fname)); - if (rc < 0) { - log(ALL, LOG_WARNING, "Can't write to set_event\n"); - close(fd); -@@ -330,7 +332,6 @@ static int get_pagesize(struct ras_events *ras, struct pevent *pevent) - error: - close(fd); - return page_size; -- - } - - static void parse_ras_data(struct pthread_data *pdata, struct kbuffer *kbuf, -@@ -411,7 +412,7 @@ static int set_buffer_percent(struct ras_events *ras, int percent) - } - - static int read_ras_event_all_cpus(struct pthread_data *pdata, -- unsigned n_cpus) -+ unsigned int n_cpus) - { - ssize_t size; - unsigned long long time_stamp; -@@ -462,7 +463,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - - /* FIXME: use select to open for all CPUs */ - snprintf(pipe_raw, sizeof(pipe_raw), -- "per_cpu/cpu%d/trace_pipe_raw", i); -+ "per_cpu/cpu%d/trace_pipe_raw", i); - - fds[i].fd = open_trace(pdata[0].ras, pipe_raw, O_RDONLY); - if (fds[i].fd < 0) { -@@ -512,7 +513,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, - fdsiginfo.ssi_signo == SIGTERM || - fdsiginfo.ssi_signo 
== SIGHUP || - fdsiginfo.ssi_signo == SIGQUIT) { -- log(TERM, LOG_INFO, "Recevied signal=%d\n", -+ log(TERM, LOG_INFO, "Received signal=%d\n", - fdsiginfo.ssi_signo); - goto cleanup; - } else { -@@ -717,7 +718,7 @@ static int select_tracing_timestamp(struct ras_events *ras) - int fd, rc; - time_t uptime, now; - size_t size; -- unsigned j1; -+ unsigned int j1; - char buf[4096]; - - /* Check if uptime is supported (kernel 3.10-rc1 or upper) */ -@@ -850,7 +851,7 @@ static int add_event_handler(struct ras_events *ras, struct pevent *pevent, - - if (is_disabled_event(group, event)) { - log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n", -- group, event); -+ group, event); - return -EINVAL; - } - -@@ -1043,7 +1044,6 @@ int handle_ras_events(int record_events) - if (!data) - goto err; - -- - for (i = 0; i < cpus; i++) { - data[i].ras = ras; - data[i].cpu = i; -@@ -1058,14 +1058,14 @@ int handle_ras_events(int record_events) - } - - log(SYSLOG, LOG_INFO, -- "Opening one thread per cpu (%d threads)\n", cpus); -+ "Opening one thread per cpu (%d threads)\n", cpus); - for (i = 0; i < cpus; i++) { - rc = pthread_create(&data[i].thread, NULL, -- handle_ras_events_cpu, -+ handle_ras_events_cpu, - (void *)&data[i]); - if (rc) { - log(SYSLOG, LOG_INFO, -- "Failed to create thread for cpu %d. Aborting.\n", -+ "Failed to create thread for cpu %d. 
Aborting.\n", - i); - while (--i) - pthread_cancel(data[i].thread); -diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c -index 1834687..b40160a 100644 ---- a/ras-extlog-handler.c -+++ b/ras-extlog-handler.c -@@ -139,7 +139,7 @@ static char *err_cper_data(const char *c) - p += sprintf(p, "card_handle: %d ", cpd->mem_array_handle); - if (cpd->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) - p += sprintf(p, "module_handle: %d ", cpd->mem_dev_handle); -- p += sprintf(p-1, ")"); -+ p += sprintf(p - 1, ")"); - - return buf; - } -@@ -149,10 +149,10 @@ static char *uuid_le(const char *uu) - static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; - char *p = uuid; - int i; -- static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; -+ static const unsigned char le[16] = {3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; - - for (i = 0; i < 16; i++) { -- p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned char)uu[le[i]]); - switch (i) { - case 3: - case 5: -@@ -168,14 +168,13 @@ static char *uuid_le(const char *uu) - return uuid; - } - -- - static void report_extlog_mem_event(struct ras_events *ras, - struct pevent_record *record, - struct trace_seq *s, - struct ras_extlog_event *ev) - { - trace_seq_printf(s, "%d %s error: %s physical addr: 0x%llx mask: 0x%llx%s %s %s", -- ev->error_seq, err_severity(ev->severity), -+ ev->error_seq, err_severity(ev->severity), - err_type(ev->etype), ev->address, - err_mask(ev->pa_mask_lsb), - err_cper_data(ev->cper_data), -@@ -204,7 +203,7 @@ int ras_extlog_mem_event_handler(struct trace_seq *s, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -diff --git a/ras-mc-handler.c b/ras-mc-handler.c -index b62dfb6..7a4fa3c 100644 ---- a/ras-mc-handler.c -+++ b/ras-mc-handler.c -@@ -134,7 +134,7 @@ int ras_mc_event_handler(struct trace_seq *s, - */ - - if 
(ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -@@ -207,22 +207,22 @@ int ras_mc_event_handler(struct trace_seq *s, - if (pevent_get_field_val(s, event, "top_layer", record, &val, 1) < 0) - goto parse_error; - parsed_fields++; -- ev.top_layer = (signed char) val; -+ ev.top_layer = (signed char)val; - - if (pevent_get_field_val(s, event, "middle_layer", record, &val, 1) < 0) - goto parse_error; - parsed_fields++; -- ev.middle_layer = (signed char) val; -+ ev.middle_layer = (signed char)val; - - if (pevent_get_field_val(s, event, "lower_layer", record, &val, 1) < 0) - goto parse_error; - parsed_fields++; -- ev.lower_layer = (signed char) val; -+ ev.lower_layer = (signed char)val; - - if (ev.top_layer >= 0 || ev.middle_layer >= 0 || ev.lower_layer >= 0) { - if (ev.lower_layer >= 0) - trace_seq_printf(s, " location: %d:%d:%d", -- ev.top_layer, ev.middle_layer, ev.lower_layer); -+ ev.top_layer, ev.middle_layer, ev.lower_layer); - else if (ev.middle_layer >= 0) - trace_seq_printf(s, " location: %d:%d", - ev.top_layer, ev.middle_layer); -diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index 370e68a..f55c732 100644 ---- a/ras-mce-handler.c -+++ b/ras-mce-handler.c -@@ -114,11 +114,11 @@ static enum cputype select_intel_cputype(struct mce_priv *mce) - else if (mce->model == 0x6a) - return CPU_ICELAKE_XEON; - else if (mce->model == 0x6c) -- return CPU_ICELAKE_DE; -+ return CPU_ICELAKE_DE; - else if (mce->model == 0x86) -- return CPU_TREMONT_D; -+ return CPU_TREMONT_D; - else if (mce->model == 0x8f) -- return CPU_SAPPHIRERAPIDS; -+ return CPU_SAPPHIRERAPIDS; - else if (mce->model == 0xcf) - return CPU_EMERALDRAPIDS; - -@@ -161,7 +161,7 @@ static int detect_cpu(struct mce_priv *mce) - mce->mhz = 0; - mce->vendor[0] = '\0'; - -- f = fopen("/proc/cpuinfo","r"); -+ f = fopen("/proc/cpuinfo", "r"); - if (!f) { - log(ALL, LOG_INFO, "Can't open /proc/cpuinfo\n"); - return errno; -@@ 
-169,7 +169,7 @@ static int detect_cpu(struct mce_priv *mce) - - while (seen != CPU_ALL && getdelim(&line, &linelen, '\n', f) > 0) { - if (sscanf(line, "vendor_id : %63[^\n]", -- (char *)&mce->vendor) == 1) -+ (char *)&mce->vendor) == 1) - seen |= CPU_VENDOR; - else if (sscanf(line, "cpu family : %d", &mce->family) == 1) - seen |= CPU_FAMILY; -@@ -189,7 +189,7 @@ static int detect_cpu(struct mce_priv *mce) - - if (seen != CPU_ALL) { - log(ALL, LOG_INFO, "Can't parse /proc/cpuinfo: missing%s%s%s%s%s\n", -- (seen & CPU_VENDOR) ? "" : " [vendor_id]", -+ (seen & CPU_VENDOR) ? "" : " [vendor_id]", - (seen & CPU_FAMILY) ? "" : " [cpu family]", - (seen & CPU_MODEL) ? "" : " [model]", - (seen & CPU_MHZ) ? "" : " [cpu MHz]", -@@ -215,12 +215,12 @@ static int detect_cpu(struct mce_priv *mce) - ret = EINVAL; - } - goto ret; -- } else if (!strcmp(mce->vendor,"HygonGenuine")) { -+ } else if (!strcmp(mce->vendor, "HygonGenuine")) { - if (mce->family == 24) { - mce->cputype = CPU_DHYANA; - } - goto ret; -- } else if (!strcmp(mce->vendor,"GenuineIntel")) { -+ } else if (!strcmp(mce->vendor, "GenuineIntel")) { - mce->cputype = select_intel_cputype(mce); - } else { - ret = EINVAL; -@@ -233,7 +233,7 @@ ret: - return ret; - } - --int register_mce_handler(struct ras_events *ras, unsigned ncpus) -+int register_mce_handler(struct ras_events *ras, unsigned int ncpus) - { - int rc; - struct mce_priv *mce; -@@ -249,8 +249,8 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus) - rc = detect_cpu(mce); - if (rc) { - if (mce->processor_flags) -- free (mce->processor_flags); -- free (ras->mce_priv); -+ free(mce->processor_flags); -+ free(ras->mce_priv); - ras->mce_priv = NULL; - return (rc); - } -@@ -290,7 +290,7 @@ static void report_mce_event(struct ras_events *ras, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -@@ -459,7 +459,7 @@ int ras_offline_mce_event(struct 
ras_mc_offline_event *event) - mce->ipid = event->ipid; - if (!mce->ipid || !mce->status) { - log(TERM, LOG_ERR, "%s MSR required.\n", -- mce->ipid ? "Status" : "Ipid"); -+ mce->ipid ? "Status" : "Ipid"); - rc = -EINVAL; - goto free_mce; - } -diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c -index d6e83a9..855c08c 100644 ---- a/ras-memory-failure-handler.c -+++ b/ras-memory-failure-handler.c -@@ -167,7 +167,6 @@ static const char *get_action_result(int result) - return "unknown"; - } - -- - int ras_memory_failure_event_handler(struct trace_seq *s, - struct pevent_record *record, - struct event_format *event, void *context) -@@ -188,7 +187,7 @@ int ras_memory_failure_event_handler(struct trace_seq *s, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c -index 3a4e300..968bd56 100644 ---- a/ras-non-standard-handler.c -+++ b/ras-non-standard-handler.c -@@ -24,8 +24,9 @@ - - static struct ras_ns_ev_decoder *ras_ns_ev_dec_list; - --void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) { -- trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]); -+void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) -+{ -+ trace_seq_printf(s, "%02x%02x%02x%02x", buf[index + 3], buf[index + 2], buf[index + 1], buf[index]); - } - - static char *uuid_le(const char *uu) -@@ -33,10 +34,10 @@ static char *uuid_le(const char *uu) - static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; - char *p = uuid; - int i; -- static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; -+ static const unsigned char le[16] = {3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; - - for (i = 0; i < 16; i++) { -- p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); -+ p += sprintf(p, "%.2x", (unsigned 
char)uu[le[i]]); - switch (i) { - case 3: - case 5: -@@ -183,7 +184,7 @@ int ras_non_standard_event_handler(struct trace_seq *s, - */ - - if (ras->use_uptime) -- now = record->ts/user_hz + ras->uptime_diff; -+ now = record->ts / user_hz + ras->uptime_diff; - else - now = time(NULL); - -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index caa8c31..89f8c15 100644 ---- a/ras-page-isolation.c -+++ b/ras-page-isolation.c -@@ -116,7 +116,7 @@ static void parse_isolation_env(struct isolation *config) - int unit_matched = 0; - unsigned long value, tmp; - -- /* check if env is vaild */ -+ /* check if env is valid */ - if (env && strlen(env)) { - /* All the character before unit must be digit */ - for (i = 0; i < strlen(env) - 1; i++) { -@@ -125,7 +125,7 @@ static void parse_isolation_env(struct isolation *config) - } - if (sscanf(env, "%lu", &value) < 1 || !value) - goto parse; -- /* check if the unit is vaild */ -+ /* check if the unit is valid */ - unit = env + strlen(env) - 1; - /* no unit, all the character are value character */ - if (isdigit(*unit)) { -@@ -151,7 +151,7 @@ parse: - config->unit = unit; - } else { - log(TERM, LOG_INFO, "Improper %s, set to default %s.\n", -- config->name, config->env); -+ config->name, config->env); - } - - /* if env value string is greater than ulong_max, truncate the last digit */ -@@ -177,10 +177,11 @@ static void parse_env_string(struct isolation *config, char *str, unsigned int s - - if (config->overflow) { - /* when overflow, use basic unit */ -- for (i = 0; config->units[i].name; i++) ; -- snprintf(str, size, "%lu%s", config->val, config->units[i-1].name); -+ for (i = 0; config->units[i].name; i++) -+ ; -+ snprintf(str, size, "%lu%s", config->val, config->units[i - 1].name); - log(TERM, LOG_INFO, "%s is set overflow(%s), truncate it\n", -- config->name, config->env); -+ config->name, config->env); - } else { - snprintf(str, size, "%s%s", config->env, config->unit); - } -@@ -202,7 +203,7 @@ static void 
page_isolation_init(void) - parse_env_string(&threshold, threshold_string, sizeof(threshold_string)); - parse_env_string(&cycle, cycle_string, sizeof(cycle_string)); - log(TERM, LOG_INFO, "Threshold of memory Corrected Errors is %s / %s\n", -- threshold_string, cycle_string); -+ threshold_string, cycle_string); - } - - void ras_page_account_init(void) -@@ -239,7 +240,7 @@ static void page_offline(struct page_record *pr) - /* Offlining page is not required */ - if (offline <= OFFLINE_ACCOUNT) { - log(TERM, LOG_INFO, "PAGE_CE_ACTION=%s, ignore to offline page at %#llx\n", -- offline_choice[offline].name, addr); -+ offline_choice[offline].name, addr); - return; - } - -@@ -264,7 +265,7 @@ static void page_offline(struct page_record *pr) - addr, page_state[pr->offlined]); - } - --static void page_record(struct page_record *pr, unsigned count, time_t time) -+static void page_record(struct page_record *pr, unsigned int count, time_t time) - { - unsigned long period = time - pr->start; - unsigned long tolerate; -@@ -328,7 +329,7 @@ static struct page_record *page_lookup_insert(unsigned long long addr) - return find; - } - --void ras_record_page_error(unsigned long long addr, unsigned count, time_t time) -+void ras_record_page_error(unsigned long long addr, unsigned int count, time_t time) - { - struct page_record *pr = NULL; - -diff --git a/ras-record.c b/ras-record.c -index adb00ca..0409099 100644 ---- a/ras-record.c -+++ b/ras-record.c -@@ -42,20 +42,20 @@ - */ - - static const struct db_fields mc_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="err_count", .type="INTEGER" }, -- { .name="err_type", .type="TEXT" }, -- { .name="err_msg", .type="TEXT" }, -- { .name="label", .type="TEXT" }, -- { .name="mc", .type="INTEGER" }, -- { .name="top_layer", .type="INTEGER" }, -- { .name="middle_layer", .type="INTEGER" }, -- { .name="lower_layer", .type="INTEGER" }, -- { .name="address", .type="INTEGER" }, -- { 
.name="grain", .type="INTEGER" }, -- { .name="syndrome", .type="INTEGER" }, -- { .name="driver_detail", .type="TEXT" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "err_count", .type = "INTEGER" }, -+ { .name = "err_type", .type = "TEXT" }, -+ { .name = "err_msg", .type = "TEXT" }, -+ { .name = "label", .type = "TEXT" }, -+ { .name = "mc", .type = "INTEGER" }, -+ { .name = "top_layer", .type = "INTEGER" }, -+ { .name = "middle_layer", .type = "INTEGER" }, -+ { .name = "lower_layer", .type = "INTEGER" }, -+ { .name = "address", .type = "INTEGER" }, -+ { .name = "grain", .type = "INTEGER" }, -+ { .name = "syndrome", .type = "INTEGER" }, -+ { .name = "driver_detail", .type = "TEXT" }, - }; - - static const struct db_table_descriptor mc_event_tab = { -@@ -82,9 +82,9 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) - sqlite3_bind_int (priv->stmt_mc_event, 7, ev->top_layer); - sqlite3_bind_int (priv->stmt_mc_event, 8, ev->middle_layer); - sqlite3_bind_int (priv->stmt_mc_event, 9, ev->lower_layer); -- sqlite3_bind_int64 (priv->stmt_mc_event, 10, ev->address); -- sqlite3_bind_int64 (priv->stmt_mc_event, 11, ev->grain); -- sqlite3_bind_int64 (priv->stmt_mc_event, 12, ev->syndrome); -+ sqlite3_bind_int64(priv->stmt_mc_event, 10, ev->address); -+ sqlite3_bind_int64(priv->stmt_mc_event, 11, ev->grain); -+ sqlite3_bind_int64(priv->stmt_mc_event, 12, ev->syndrome); - sqlite3_bind_text(priv->stmt_mc_event, 13, ev->driver_detail, -1, NULL); - rc = sqlite3_step(priv->stmt_mc_event); - if (rc != SQLITE_OK && rc != SQLITE_DONE) -@@ -106,11 +106,11 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) - - #ifdef HAVE_AER - static const struct db_fields aer_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="dev_name", .type="TEXT" }, -- { .name="err_type", .type="TEXT" }, -- { .name="err_msg", .type="TEXT" }, 
-+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "dev_name", .type = "TEXT" }, -+ { .name = "err_type", .type = "TEXT" }, -+ { .name = "err_msg", .type = "TEXT" }, - }; - - static const struct db_table_descriptor aer_event_tab = { -@@ -154,13 +154,13 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) - - #ifdef HAVE_NON_STANDARD - static const struct db_fields non_standard_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="sec_type", .type="BLOB" }, -- { .name="fru_id", .type="BLOB" }, -- { .name="fru_text", .type="TEXT" }, -- { .name="severity", .type="TEXT" }, -- { .name="error", .type="BLOB" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "sec_type", .type = "BLOB" }, -+ { .name = "fru_id", .type = "BLOB" }, -+ { .name = "fru_text", .type = "TEXT" }, -+ { .name = "severity", .type = "TEXT" }, -+ { .name = "error", .type = "BLOB" }, - }; - - static const struct db_table_descriptor non_standard_event_tab = { -@@ -178,12 +178,12 @@ int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standar - return 0; - log(TERM, LOG_INFO, "non_standard_event store: %p\n", priv->stmt_non_standard_record); - -- sqlite3_bind_text (priv->stmt_non_standard_record, 1, ev->timestamp, -1, NULL); -- sqlite3_bind_blob (priv->stmt_non_standard_record, 2, ev->sec_type, -1, NULL); -- sqlite3_bind_blob (priv->stmt_non_standard_record, 3, ev->fru_id, 16, NULL); -- sqlite3_bind_text (priv->stmt_non_standard_record, 4, ev->fru_text, -1, NULL); -- sqlite3_bind_text (priv->stmt_non_standard_record, 5, ev->severity, -1, NULL); -- sqlite3_bind_blob (priv->stmt_non_standard_record, 6, ev->error, ev->length, NULL); -+ sqlite3_bind_text(priv->stmt_non_standard_record, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_blob(priv->stmt_non_standard_record, 2, ev->sec_type, -1, 
NULL); -+ sqlite3_bind_blob(priv->stmt_non_standard_record, 3, ev->fru_id, 16, NULL); -+ sqlite3_bind_text(priv->stmt_non_standard_record, 4, ev->fru_text, -1, NULL); -+ sqlite3_bind_text(priv->stmt_non_standard_record, 5, ev->severity, -1, NULL); -+ sqlite3_bind_blob(priv->stmt_non_standard_record, 6, ev->error, ev->length, NULL); - - rc = sqlite3_step(priv->stmt_non_standard_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) -@@ -205,16 +205,16 @@ int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standar - - #ifdef HAVE_ARM - static const struct db_fields arm_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="error_count", .type="INTEGER" }, -- { .name="affinity", .type="INTEGER" }, -- { .name="mpidr", .type="INTEGER" }, -- { .name="running_state", .type="INTEGER" }, -- { .name="psci_state", .type="INTEGER" }, -- { .name="err_info", .type="BLOB" }, -- { .name="context_info", .type="BLOB" }, -- { .name="vendor_info", .type="BLOB" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "error_count", .type = "INTEGER" }, -+ { .name = "affinity", .type = "INTEGER" }, -+ { .name = "mpidr", .type = "INTEGER" }, -+ { .name = "running_state", .type = "INTEGER" }, -+ { .name = "psci_state", .type = "INTEGER" }, -+ { .name = "err_info", .type = "BLOB" }, -+ { .name = "context_info", .type = "BLOB" }, -+ { .name = "vendor_info", .type = "BLOB" }, - }; - - static const struct db_table_descriptor arm_event_tab = { -@@ -232,18 +232,18 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) - return 0; - log(TERM, LOG_INFO, "arm_event store: %p\n", priv->stmt_arm_record); - -- sqlite3_bind_text (priv->stmt_arm_record, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_text(priv->stmt_arm_record, 1, ev->timestamp, -1, NULL); - sqlite3_bind_int (priv->stmt_arm_record, 2, ev->error_count); - sqlite3_bind_int 
(priv->stmt_arm_record, 3, ev->affinity); -- sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr); -+ sqlite3_bind_int64(priv->stmt_arm_record, 4, ev->mpidr); - sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state); - sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state); -- sqlite3_bind_blob (priv->stmt_arm_record, 7, -- ev->pei_error, ev->pei_len, NULL); -- sqlite3_bind_blob (priv->stmt_arm_record, 8, -- ev->ctx_error, ev->ctx_len, NULL); -- sqlite3_bind_blob (priv->stmt_arm_record, 9, -- ev->vsei_error, ev->oem_len, NULL); -+ sqlite3_bind_blob(priv->stmt_arm_record, 7, -+ ev->pei_error, ev->pei_len, NULL); -+ sqlite3_bind_blob(priv->stmt_arm_record, 8, -+ ev->ctx_error, ev->ctx_len, NULL); -+ sqlite3_bind_blob(priv->stmt_arm_record, 9, -+ ev->vsei_error, ev->oem_len, NULL); - - rc = sqlite3_step(priv->stmt_arm_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) -@@ -262,15 +262,15 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) - - #ifdef HAVE_EXTLOG - static const struct db_fields extlog_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="etype", .type="INTEGER" }, -- { .name="error_count", .type="INTEGER" }, -- { .name="severity", .type="INTEGER" }, -- { .name="address", .type="INTEGER" }, -- { .name="fru_id", .type="BLOB" }, -- { .name="fru_text", .type="TEXT" }, -- { .name="cper_data", .type="BLOB" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "etype", .type = "INTEGER" }, -+ { .name = "error_count", .type = "INTEGER" }, -+ { .name = "severity", .type = "INTEGER" }, -+ { .name = "address", .type = "INTEGER" }, -+ { .name = "fru_id", .type = "BLOB" }, -+ { .name = "fru_text", .type = "TEXT" }, -+ { .name = "cper_data", .type = "BLOB" }, - }; - - static const struct db_table_descriptor extlog_event_tab = { -@@ -288,14 +288,14 @@ int ras_store_extlog_mem_record(struct ras_events 
*ras, struct ras_extlog_event - return 0; - log(TERM, LOG_INFO, "extlog_record store: %p\n", priv->stmt_extlog_record); - -- sqlite3_bind_text (priv->stmt_extlog_record, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_text(priv->stmt_extlog_record, 1, ev->timestamp, -1, NULL); - sqlite3_bind_int (priv->stmt_extlog_record, 2, ev->etype); - sqlite3_bind_int (priv->stmt_extlog_record, 3, ev->error_seq); - sqlite3_bind_int (priv->stmt_extlog_record, 4, ev->severity); -- sqlite3_bind_int64 (priv->stmt_extlog_record, 5, ev->address); -- sqlite3_bind_blob (priv->stmt_extlog_record, 6, ev->fru_id, 16, NULL); -- sqlite3_bind_text (priv->stmt_extlog_record, 7, ev->fru_text, -1, NULL); -- sqlite3_bind_blob (priv->stmt_extlog_record, 8, ev->cper_data, ev->cper_data_length, NULL); -+ sqlite3_bind_int64(priv->stmt_extlog_record, 5, ev->address); -+ sqlite3_bind_blob(priv->stmt_extlog_record, 6, ev->fru_id, 16, NULL); -+ sqlite3_bind_text(priv->stmt_extlog_record, 7, ev->fru_text, -1, NULL); -+ sqlite3_bind_blob(priv->stmt_extlog_record, 8, ev->cper_data, ev->cper_data_length, NULL); - - rc = sqlite3_step(priv->stmt_extlog_record); - if (rc != SQLITE_OK && rc != SQLITE_DONE) -@@ -318,8 +318,8 @@ int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event - - #ifdef HAVE_MCE - static const struct db_fields mce_record_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, - - /* MCE registers */ - { .name="mcgcap", .type="INTEGER" }, -@@ -363,7 +363,7 @@ int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) - return 0; - log(TERM, LOG_INFO, "mce_record store: %p\n", priv->stmt_mce_record); - -- sqlite3_bind_text (priv->stmt_mce_record, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mce_record, 1, ev->timestamp, -1, NULL); - sqlite3_bind_int (priv->stmt_mce_record, 2, ev->mcgcap); - 
sqlite3_bind_int (priv->stmt_mce_record, 3, ev->mcgstatus); - sqlite3_bind_int64 (priv->stmt_mce_record, 4, ev->status); -@@ -409,13 +409,13 @@ int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) - - #ifdef HAVE_DEVLINK - static const struct db_fields devlink_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="bus_name", .type="TEXT" }, -- { .name="dev_name", .type="TEXT" }, -- { .name="driver_name", .type="TEXT" }, -- { .name="reporter_name", .type="TEXT" }, -- { .name="msg", .type="TEXT" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "bus_name", .type = "TEXT" }, -+ { .name = "dev_name", .type = "TEXT" }, -+ { .name = "driver_name", .type = "TEXT" }, -+ { .name = "reporter_name", .type = "TEXT" }, -+ { .name = "msg", .type = "TEXT" }, - }; - - static const struct db_table_descriptor devlink_event_tab = { -@@ -461,14 +461,14 @@ int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) - - #ifdef HAVE_DISKERROR - static const struct db_fields diskerror_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="dev", .type="TEXT" }, -- { .name="sector", .type="INTEGER" }, -- { .name="nr_sector", .type="INTEGER" }, -- { .name="error", .type="TEXT" }, -- { .name="rwbs", .type="TEXT" }, -- { .name="cmd", .type="TEXT" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "dev", .type = "TEXT" }, -+ { .name = "sector", .type = "INTEGER" }, -+ { .name = "nr_sector", .type = "INTEGER" }, -+ { .name = "error", .type = "TEXT" }, -+ { .name = "rwbs", .type = "TEXT" }, -+ { .name = "cmd", .type = "TEXT" }, - }; - - static const struct db_table_descriptor diskerror_event_tab = { -@@ -515,11 +515,11 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev - - #ifdef 
HAVE_MEMORY_FAILURE - static const struct db_fields mf_event_fields[] = { -- { .name="id", .type="INTEGER PRIMARY KEY" }, -- { .name="timestamp", .type="TEXT" }, -- { .name="pfn", .type="TEXT" }, -- { .name="page_type", .type="TEXT" }, -- { .name="action_result", .type="TEXT" }, -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "pfn", .type = "TEXT" }, -+ { .name = "page_type", .type = "TEXT" }, -+ { .name = "action_result", .type = "TEXT" }, - }; - - static const struct db_table_descriptor mf_event_tab = { -@@ -664,7 +664,7 @@ static int ras_mc_alter_table(struct sqlite3_priv *priv, - found = 0; - for (j = 0; j < col_count; j++) { - if (!strcmp(field->name, -- sqlite3_column_name(*stmt, j))) { -+ sqlite3_column_name(*stmt, j))) { - found = 1; - break; - } -@@ -755,13 +755,13 @@ int ras_mc_finalize_vendor_table(sqlite3_stmt *stmt) - return rc; - } - --int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) -+int ras_mc_event_opendb(unsigned int cpu, struct ras_events *ras) - { - int rc; - sqlite3 *db; - struct sqlite3_priv *priv; - -- printf("Calling %s()\n", __FUNCTION__); -+ printf("Calling %s()\n", __func__); - - ras->db_ref_count++; - if (ras->db_ref_count > 1) -@@ -774,6 +774,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) - return -1; - - struct stat st = {0}; -+ - if (stat(RASSTATEDIR, &st) == -1) { - if (errno != ENOENT) { - log(TERM, LOG_ERR, -@@ -855,7 +856,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) - rc = ras_mc_create_table(priv, &non_standard_event_tab); - if (rc == SQLITE_OK) { - rc = ras_mc_prepare_stmt(priv, &priv->stmt_non_standard_record, -- &non_standard_event_tab); -+ &non_standard_event_tab); - if (rc != SQLITE_OK) - goto error; - } -@@ -865,7 +866,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) - rc = ras_mc_create_table(priv, &arm_event_tab); - if (rc == SQLITE_OK) { - rc = ras_mc_prepare_stmt(priv, 
&priv->stmt_arm_record, -- &arm_event_tab); -+ &arm_event_tab); - if (rc != SQLITE_OK) - goto error; - } -@@ -874,7 +875,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) - rc = ras_mc_create_table(priv, &devlink_event_tab); - if (rc == SQLITE_OK) { - rc = ras_mc_prepare_stmt(priv, &priv->stmt_devlink_event, -- &devlink_event_tab); -+ &devlink_event_tab); - if (rc != SQLITE_OK) - goto error; - } -@@ -884,7 +885,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) - rc = ras_mc_create_table(priv, &diskerror_event_tab); - if (rc == SQLITE_OK) { - rc = ras_mc_prepare_stmt(priv, &priv->stmt_diskerror_event, -- &diskerror_event_tab); -+ &diskerror_event_tab); - if (rc != SQLITE_OK) - goto error; - } -@@ -958,7 +959,6 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) - } - #endif - -- - #ifdef HAVE_MCE - if (priv->stmt_mce_record) { - rc = sqlite3_finalize(priv->stmt_mce_record); -diff --git a/ras-report.c b/ras-report.c -index 62d5eb7..6e3b351 100644 ---- a/ras-report.c -+++ b/ras-report.c -@@ -21,13 +21,14 @@ - - #include "ras-report.h" - --static int setup_report_socket(void){ -+static int setup_report_socket(void) -+{ - int sockfd = -1; - int rc = -1; - struct sockaddr_un addr; - - sockfd = socket(AF_UNIX, SOCK_STREAM, 0); -- if (sockfd < 0){ -+ if (sockfd < 0) { - return -1; - } - -@@ -45,12 +46,13 @@ static int setup_report_socket(void){ - return sockfd; - } - --static int commit_report_basic(int sockfd){ -+static int commit_report_basic(int sockfd) -+{ - char buf[INPUT_BUFFER_SIZE]; - struct utsname un; - int rc = -1; - -- if(sockfd < 0){ -+ if (sockfd < 0) { - return rc; - } - -@@ -58,7 +60,7 @@ static int commit_report_basic(int sockfd){ - memset(&un, 0, sizeof(struct utsname)); - - rc = uname(&un); -- if(rc < 0){ -+ if (rc < 0) { - return rc; - } - -@@ -67,35 +69,36 @@ static int commit_report_basic(int sockfd){ - */ - sprintf(buf, "PUT / HTTP/1.1\r\n\r\n"); - rc = write(sockfd, buf, strlen(buf)); -- if(rc < 
strlen(buf)){ -+ if (rc < strlen(buf)) { - return -1; - } - - sprintf(buf, "PID=%d", (int)getpid()); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - return -1; - } - - sprintf(buf, "EXECUTABLE=/boot/vmlinuz-%s", un.release); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - return -1; - } - - sprintf(buf, "TYPE=%s", "ras"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - return -1; - } - - return 0; - } - --static int set_mc_event_backtrace(char *buf, struct ras_mc_event *ev){ -+static int set_mc_event_backtrace(char *buf, struct ras_mc_event *ev) -+{ - char bt_buf[MAX_BACKTRACE_SIZE]; - -- if(!buf || !ev) -+ if (!buf || !ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -@@ -131,10 +134,11 @@ static int set_mc_event_backtrace(char *buf, struct ras_mc_event *ev){ - return 0; - } - --static int set_mce_event_backtrace(char *buf, struct mce_event *ev){ -+static int set_mce_event_backtrace(char *buf, struct mce_event *ev) -+{ - char bt_buf[MAX_BACKTRACE_SIZE]; - -- if(!buf || !ev) -+ if (!buf || !ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -@@ -190,10 +194,11 @@ static int set_mce_event_backtrace(char *buf, struct mce_event *ev){ - return 0; - } - --static int set_aer_event_backtrace(char *buf, struct ras_aer_event *ev){ -+static int set_aer_event_backtrace(char *buf, struct ras_aer_event *ev) -+{ - char bt_buf[MAX_BACKTRACE_SIZE]; - -- if(!buf || !ev) -+ if (!buf || !ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -@@ -211,10 +216,11 @@ static int set_aer_event_backtrace(char *buf, struct ras_aer_event *ev){ - return 0; - } - --static int set_non_standard_event_backtrace(char *buf, struct ras_non_standard_event *ev){ -+static int set_non_standard_event_backtrace(char *buf, struct ras_non_standard_event *ev) -+{ - char bt_buf[MAX_BACKTRACE_SIZE]; - -- if(!buf || !ev) -+ if (!buf || 
!ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -@@ -230,10 +236,11 @@ static int set_non_standard_event_backtrace(char *buf, struct ras_non_standard_e - return 0; - } - --static int set_arm_event_backtrace(char *buf, struct ras_arm_event *ev){ -+static int set_arm_event_backtrace(char *buf, struct ras_arm_event *ev) -+{ - char bt_buf[MAX_BACKTRACE_SIZE]; - -- if(!buf || !ev) -+ if (!buf || !ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -@@ -257,10 +264,11 @@ static int set_arm_event_backtrace(char *buf, struct ras_arm_event *ev){ - return 0; - } - --static int set_devlink_event_backtrace(char *buf, struct devlink_event *ev){ -+static int set_devlink_event_backtrace(char *buf, struct devlink_event *ev) -+{ - char bt_buf[MAX_BACKTRACE_SIZE]; - -- if(!buf || !ev) -+ if (!buf || !ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -@@ -282,10 +290,11 @@ static int set_devlink_event_backtrace(char *buf, struct devlink_event *ev){ - return 0; - } - --static int set_diskerror_event_backtrace(char *buf, struct diskerror_event *ev) { -+static int set_diskerror_event_backtrace(char *buf, struct diskerror_event *ev) -+{ - char bt_buf[MAX_BACKTRACE_SIZE]; - -- if(!buf || !ev) -+ if (!buf || !ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -@@ -317,14 +326,14 @@ static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev) - return -1; - - sprintf(bt_buf, "BACKTRACE=" \ -- "timestamp=%s\n" \ -- "pfn=%s\n" \ -- "page_type=%s\n" \ -- "action_result=%s\n", \ -- ev->timestamp, \ -- ev->pfn, \ -- ev->page_type, \ -- ev->action_result); -+ "timestamp=%s\n" \ -+ "pfn=%s\n" \ -+ "page_type=%s\n" \ -+ "action_result=%s\n", \ -+ ev->timestamp, \ -+ ev->pfn, \ -+ ev->page_type, \ -+ ev->action_result); - - strcat(buf, bt_buf); - -@@ -337,13 +346,13 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ - int rc = -1; - int buf_len = 0; - -- if(sockfd < 0 || !ev){ -+ if (sockfd < 0 || !ev) { - return -1; - } - - memset(buf, 0, MAX_BACKTRACE_SIZE); - 
-- switch(type){ -+ switch (type) { - case MC_EVENT: - rc = set_mc_event_backtrace(buf, (struct ras_mc_event *)ev); - break; -@@ -372,15 +381,15 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ - return -1; - } - -- if(rc < 0){ -+ if (rc < 0) { - return -1; - } - - buf_len = strlen(buf); - -- for(;buf_len > INPUT_BUFFER_SIZE - 1; buf_len -= (INPUT_BUFFER_SIZE - 1)){ -+ for (; buf_len > INPUT_BUFFER_SIZE - 1; buf_len -= (INPUT_BUFFER_SIZE - 1)) { - rc = write(sockfd, pbuf, INPUT_BUFFER_SIZE - 1); -- if(rc < INPUT_BUFFER_SIZE - 1){ -+ if (rc < INPUT_BUFFER_SIZE - 1) { - return -1; - } - -@@ -388,14 +397,15 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ - } - - rc = write(sockfd, pbuf, buf_len + 1); -- if(rc < buf_len){ -+ if (rc < buf_len) { - return -1; - } - - return 0; - } - --int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ -+int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev) -+{ - char buf[MAX_MESSAGE_SIZE]; - int sockfd = -1; - int done = 0; -@@ -404,29 +414,29 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ - memset(buf, 0, sizeof(buf)); - - sockfd = setup_report_socket(); -- if(sockfd < 0){ -+ if (sockfd < 0) { - return -1; - } - - rc = commit_report_basic(sockfd); -- if(rc < 0){ -+ if (rc < 0) { - goto mc_fail; - } - - rc = commit_report_backtrace(sockfd, MC_EVENT, ev); -- if(rc < 0){ -+ if (rc < 0) { - goto mc_fail; - } - - sprintf(buf, "ANALYZER=%s", "rasdaemon-mc"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto mc_fail; - } - - sprintf(buf, "REASON=%s", "EDAC driver report problem"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto mc_fail; - } - -@@ -434,18 +444,19 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ - - mc_fail: - -- if(sockfd >= 0){ -+ if (sockfd >= 0) { - 
close(sockfd); - } - -- if(done){ -+ if (done) { - return 0; -- }else{ -+ } else { - return -1; - } - } - --int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ -+int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) -+{ - char buf[MAX_MESSAGE_SIZE]; - int sockfd = 0; - int done = 0; -@@ -454,29 +465,29 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ - memset(buf, 0, sizeof(buf)); - - sockfd = setup_report_socket(); -- if(sockfd < 0){ -+ if (sockfd < 0) { - return -1; - } - - rc = commit_report_basic(sockfd); -- if(rc < 0){ -+ if (rc < 0) { - goto aer_fail; - } - - rc = commit_report_backtrace(sockfd, AER_EVENT, ev); -- if(rc < 0){ -+ if (rc < 0) { - goto aer_fail; - } - - sprintf(buf, "ANALYZER=%s", "rasdaemon-aer"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto aer_fail; - } - - sprintf(buf, "REASON=%s", "PCIe AER driver report problem"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto aer_fail; - } - -@@ -484,18 +495,19 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ - - aer_fail: - -- if(sockfd >= 0){ -+ if (sockfd >= 0) { - close(sockfd); - } - -- if(done){ -+ if (done) { - return 0; -- }else{ -+ } else { - return -1; - } - } - --int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev){ -+int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) -+{ - char buf[MAX_MESSAGE_SIZE]; - int sockfd = 0; - int rc = -1; -@@ -503,29 +515,29 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar - memset(buf, 0, sizeof(buf)); - - sockfd = setup_report_socket(); -- if(sockfd < 0){ -+ if (sockfd < 0) { - return rc; - } - - rc = commit_report_basic(sockfd); -- if(rc < 0){ -+ if (rc < 0) { - goto non_standard_fail; - } - - rc = 
commit_report_backtrace(sockfd, NON_STANDARD_EVENT, ev); -- if(rc < 0){ -+ if (rc < 0) { - goto non_standard_fail; - } - - sprintf(buf, "ANALYZER=%s", "rasdaemon-non-standard"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto non_standard_fail; - } - - sprintf(buf, "REASON=%s", "Unknown CPER section problem"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto non_standard_fail; - } - -@@ -533,14 +545,15 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar - - non_standard_fail: - -- if(sockfd >= 0){ -+ if (sockfd >= 0) { - close(sockfd); - } - - return rc; - } - --int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){ -+int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) -+{ - char buf[MAX_MESSAGE_SIZE]; - int sockfd = 0; - int rc = -1; -@@ -548,29 +561,29 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){ - memset(buf, 0, sizeof(buf)); - - sockfd = setup_report_socket(); -- if(sockfd < 0){ -+ if (sockfd < 0) { - return rc; - } - - rc = commit_report_basic(sockfd); -- if(rc < 0){ -+ if (rc < 0) { - goto arm_fail; - } - - rc = commit_report_backtrace(sockfd, ARM_EVENT, ev); -- if(rc < 0){ -+ if (rc < 0) { - goto arm_fail; - } - - sprintf(buf, "ANALYZER=%s", "rasdaemon-arm"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto arm_fail; - } - - sprintf(buf, "REASON=%s", "ARM CPU report problem"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto arm_fail; - } - -@@ -578,14 +591,15 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){ - - arm_fail: - -- if(sockfd >= 0){ -+ if (sockfd >= 0) { - close(sockfd); - } - - return rc; - } - --int ras_report_mce_event(struct ras_events *ras, struct mce_event 
*ev){ -+int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) -+{ - char buf[MAX_MESSAGE_SIZE]; - int sockfd = 0; - int done = 0; -@@ -594,29 +608,29 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ - memset(buf, 0, sizeof(buf)); - - sockfd = setup_report_socket(); -- if(sockfd < 0){ -+ if (sockfd < 0) { - return -1; - } - - rc = commit_report_basic(sockfd); -- if(rc < 0){ -+ if (rc < 0) { - goto mce_fail; - } - - rc = commit_report_backtrace(sockfd, MCE_EVENT, ev); -- if(rc < 0){ -+ if (rc < 0) { - goto mce_fail; - } - - sprintf(buf, "ANALYZER=%s", "rasdaemon-mce"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto mce_fail; - } - - sprintf(buf, "REASON=%s", "Machine Check driver report problem"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto mce_fail; - } - -@@ -624,18 +638,19 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ - - mce_fail: - -- if(sockfd >= 0){ -+ if (sockfd >= 0) { - close(sockfd); - } - -- if(done){ -+ if (done) { - return 0; -- }else{ -+ } else { - return -1; - } - } - --int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev){ -+int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) -+{ - char buf[MAX_MESSAGE_SIZE]; - int sockfd = 0; - int done = 0; -@@ -644,29 +659,29 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev){ - memset(buf, 0, sizeof(buf)); - - sockfd = setup_report_socket(); -- if(sockfd < 0){ -+ if (sockfd < 0) { - return -1; - } - - rc = commit_report_basic(sockfd); -- if(rc < 0){ -+ if (rc < 0) { - goto devlink_fail; - } - - rc = commit_report_backtrace(sockfd, DEVLINK_EVENT, ev); -- if(rc < 0){ -+ if (rc < 0) { - goto devlink_fail; - } - - sprintf(buf, "ANALYZER=%s", "rasdaemon-devlink"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ 
-+ if (rc < strlen(buf) + 1) { - goto devlink_fail; - } - - sprintf(buf, "REASON=%s", "devlink health report problem"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto devlink_fail; - } - -@@ -674,18 +689,19 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev){ - - devlink_fail: - -- if(sockfd >= 0){ -+ if (sockfd >= 0) { - close(sockfd); - } - -- if(done){ -+ if (done) { - return 0; -- }else{ -+ } else { - return -1; - } - } - --int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev){ -+int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) -+{ - char buf[MAX_MESSAGE_SIZE]; - int sockfd = 0; - int done = 0; -@@ -694,42 +710,42 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e - memset(buf, 0, sizeof(buf)); - - sockfd = setup_report_socket(); -- if(sockfd < 0){ -+ if (sockfd < 0) { - return -1; - } - - rc = commit_report_basic(sockfd); -- if(rc < 0){ -+ if (rc < 0) { - goto diskerror_fail; - } - - rc = commit_report_backtrace(sockfd, DISKERROR_EVENT, ev); -- if(rc < 0){ -+ if (rc < 0) { - goto diskerror_fail; - } - - sprintf(buf, "ANALYZER=%s", "rasdaemon-diskerror"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto diskerror_fail; - } - - sprintf(buf, "REASON=%s", "disk I/O error"); - rc = write(sockfd, buf, strlen(buf) + 1); -- if(rc < strlen(buf) + 1){ -+ if (rc < strlen(buf) + 1) { - goto diskerror_fail; - } - - done = 1; - - diskerror_fail: -- if(sockfd >= 0){ -+ if (sockfd >= 0) { - close(sockfd); - } - -- if(done){ -+ if (done) { - return 0; -- }else{ -+ } else { - return -1; - } - } -diff --git a/rasdaemon.c b/rasdaemon.c -index 0db51c9..7a3f964 100644 ---- a/rasdaemon.c -+++ b/rasdaemon.c -@@ -34,7 +34,7 @@ - #define TOOL_DESCRIPTION "RAS daemon to log the RAS events." 
- #define ARGS_DOC "" - #define DISABLE "DISABLE" --char *choices_disable = NULL; -+char *choices_disable; - - const char *argp_program_version = TOOL_NAME " " VERSION; - const char *argp_program_bug_address = "Mauro Carvalho Chehab "; -@@ -129,6 +129,7 @@ int main(int argc, char *argv[]) - { - struct arguments args; - int idx = -1; -+ - choices_disable = getenv(DISABLE); - - #ifdef HAVE_MCE -@@ -179,7 +180,7 @@ int main(int argc, char *argv[]) - .children = offline_parser, - #endif - }; -- memset (&args, 0, sizeof(args)); -+ memset(&args, 0, sizeof(args)); - - user_hz = sysconf(_SC_CLK_TCK); - -@@ -208,7 +209,7 @@ int main(int argc, char *argv[]) - - openlog(TOOL_NAME, 0, LOG_DAEMON); - if (!args.foreground) -- if (daemon(0,0)) -+ if (daemon(0, 0)) - exit(EXIT_FAILURE); - - handle_ras_events(args.record_events); -diff --git a/rbtree.c b/rbtree.c -index d9b1bd4..43da434 100644 ---- a/rbtree.c -+++ b/rbtree.c -@@ -28,7 +28,8 @@ static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) - struct rb_node *right = node->rb_right; - struct rb_node *parent = rb_parent(node); - -- if ((node->rb_right = right->rb_left)) -+ node->rb_right = right->rb_left; -+ if (node->rb_right) - rb_set_parent(right->rb_left, node); - right->rb_left = node; - -@@ -40,8 +41,7 @@ static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) - parent->rb_left = right; - else - parent->rb_right = right; -- } -- else -+ } else - root->rb_node = right; - rb_set_parent(node, right); - } -@@ -51,7 +51,8 @@ static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) - struct rb_node *left = node->rb_left; - struct rb_node *parent = rb_parent(node); - -- if ((node->rb_left = left->rb_right)) -+ node->rb_left = left->rb_right; -+ if (node->rb_left) - rb_set_parent(left->rb_right, node); - left->rb_right = node; - -@@ -63,8 +64,7 @@ static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) - parent->rb_right = left; - else - parent->rb_left = 
left; -- } -- else -+ } else - root->rb_node = left; - rb_set_parent(node, left); - } -@@ -81,6 +81,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) - { - { - register struct rb_node *uncle = gparent->rb_right; -+ - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); -@@ -94,6 +95,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) - if (parent->rb_right == node) - { - struct rb_node *tmp; -+ - __rb_rotate_left(parent, root); - tmp = parent; - parent = node; -@@ -106,6 +108,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) - } else { - { - struct rb_node *uncle = gparent->rb_left; -+ - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); -@@ -119,6 +122,7 @@ void rb_insert_color(struct rb_node *node, struct rb_root *root) - if (parent->rb_left == node) - { - struct rb_node *tmp; -+ - __rb_rotate_right(parent, root); - tmp = parent; - parent = node; -@@ -157,8 +161,7 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, - rb_set_red(other); - node = parent; - parent = rb_parent(node); -- } -- else -+ } else - { - if (!other->rb_right || rb_is_black(other->rb_right)) - { -@@ -174,8 +177,7 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, - node = root->rb_node; - break; - } -- } -- else -+ } else - { - other = parent->rb_left; - if (rb_is_red(other)) -@@ -191,8 +193,7 @@ static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, - rb_set_red(other); - node = parent; - parent = rb_parent(node); -- } -- else -+ } else - { - if (!other->rb_left || rb_is_black(other->rb_left)) - { -@@ -272,8 +273,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) - parent->rb_left = child; - else - parent->rb_right = child; -- } -- else -+ } else - root->rb_node = child; - - color: -@@ -320,7 +320,7 @@ struct rb_node *rb_next(const struct rb_node *node) - if (node->rb_right) { - node = node->rb_right; - while (node->rb_left) -- 
node=node->rb_left; -+ node = node->rb_left; - return (struct rb_node *)node; - } - -@@ -348,7 +348,7 @@ struct rb_node *rb_prev(const struct rb_node *node) - if (node->rb_left) { - node = node->rb_left; - while (node->rb_right) -- node=node->rb_right; -+ node = node->rb_right; - return (struct rb_node *)node; - } - --- -2.33.1 - diff --git a/1048-Do-a-coding-style-cleanup-with-regards-to-tabs-and-w.patch b/1048-Do-a-coding-style-cleanup-with-regards-to-tabs-and-w.patch deleted file mode 100644 index 0069232432fce6e71405c2692d1ad515f1286677..0000000000000000000000000000000000000000 --- a/1048-Do-a-coding-style-cleanup-with-regards-to-tabs-and-w.patch +++ /dev/null @@ -1,2343 +0,0 @@ -From 964756be1024a526dbe343bea7a161535051065b Mon Sep 17 00:00:00 2001 -From: Mauro Carvalho Chehab -Date: Tue, 11 Jun 2024 12:01:40 +0200 -Subject: [PATCH 48/85] Do a coding style cleanup with regards to tabs and - white spaces - -Use tabs instead of spaces and remove blank ending whitespaces. - -No functional changes. 
- -Signed-off-by: Mauro Carvalho Chehab ---- - labels/supermicro | 16 +- - util/ras-mc-ctl.in | 1765 ++++++++++++++++++++++---------------------- - 2 files changed, 890 insertions(+), 891 deletions(-) - -diff --git a/labels/supermicro b/labels/supermicro -index aea7c3c..d358bcd 100644 ---- a/labels/supermicro -+++ b/labels/supermicro -@@ -64,7 +64,7 @@ Vendor: Supermicro - P1_DIMM4B: 1.1.1; - P2_DIMM4B: 2.0.1; - P2_DIMM4B: 2.1.1; -- -+ - Model: X11DPH-i, X11DPH-T, X11DPH-TQ - P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; - P1-DIMMB1: 0.1.0; -@@ -78,7 +78,7 @@ Vendor: Supermicro - P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1; - P2-DIMME1: 3.1.0; - P2-DIMMF1: 3.2.0; -- -+ - Model: X10DRI, X10DRI-T - P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1; - P1-DIMMB1: 0.1.0; P1-DIMMB2: 0.1.1; -@@ -98,7 +98,7 @@ Vendor: Supermicro - P2-DIMMF1: 1.1.0; - P2-DIMMG1: 1.2.0; - P2-DIMMH1: 1.3.0; -- -+ - Model: X11DDW-NT, X11DDW-L - P1-DIMMA1: 0.0.0; - P1-DIMMB1: 0.1.0; -@@ -112,7 +112,7 @@ Vendor: Supermicro - P2-DIMMD1: 3.0.0; - P2-DIMME1: 3.1.0; - P2-DIMMF1: 3.2.0; -- -+ - Model: X11SPM-F, X11SPM-TF, X11SPM-TPF - DIMMA1: 0.0.0; - DIMMB1: 0.1.0; -@@ -120,7 +120,7 @@ Vendor: Supermicro - DIMMD1: 1.0.0; - DIMME1: 1.1.0; - DIMMF1: 1.2.0; -- -+ - Model: B1DRi - P1_DIMMA1: 0.0.0; - P1_DIMMB1: 0.1.0; -@@ -130,13 +130,13 @@ Vendor: Supermicro - P2_DIMMF1: 1.1.0; - P2_DIMMG1: 1.2.0; - P2_DIMMH1: 1.3.0; -- -+ - Model: X11SCA, X11SCA-F - DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0; - DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1; -- -+ - Model: X11SCW-F - DIMMA1: 0.1.0; - DIMMA2: 0.0.0; - DIMMB1: 0.1.1; -- DIMMB2: 0.0.1; -\ No newline at end of file -+ DIMMB2: 0.0.1; -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 725d215..b971ddd 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -112,25 +112,25 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} - get_mainboard_info(); - - if ($conf{opt}{mainboard} eq "report") { -- print "$prog: mainboard: ", -- "$conf{mainboard}{vendor} model 
$conf{mainboard}{model}\n"; -+ print "$prog: mainboard: ", -+ "$conf{mainboard}{vendor} model $conf{mainboard}{model}\n"; - } - - if ($conf{opt}{print_labels}) { -- print_dimm_labels (); -+ print_dimm_labels (); - - } - if ($conf{opt}{register_labels}) { -- register_dimm_labels (); -+ register_dimm_labels (); - } - if ($conf{opt}{display_memory_layout}) { -- display_memory_layout (); -+ display_memory_layout (); - } - if ($conf{opt}{guess_dimm_label}) { -- guess_dimm_label (); -+ guess_dimm_label (); - } - if ($conf{opt}{error_count}) { -- display_error_count (); -+ display_error_count (); - } - } - -@@ -184,39 +184,39 @@ sub parse_cmdline - - Getopt::Long::Configure ("bundling"); - my $rc = GetOptions ("mainboard:s" => sub { $$mref = $_[1]||"report" }, -- "help" => sub {usage (0)}, -- "quiet" => \$conf{opt}{quiet}, -- "print-labels" => \$conf{opt}{print_labels}, -- "guess-labels" => \$conf{opt}{guess_dimm_label}, -- "register-labels" => \$conf{opt}{register_labels}, -- "delay:s" => \$conf{opt}{delay}, -- "labeldb=s" => \$conf{labeldb}, -- "status" => \$conf{opt}{status}, -- "layout" => \$conf{opt}{display_memory_layout}, -- "summary" => \$conf{opt}{summary}, -- "errors" => \$conf{opt}{errors}, -- "error-count" => \$conf{opt}{error_count}, -- "vendor-errors-summary" => \$conf{opt}{vendor_errors_summary}, -- "vendor-errors" => \$conf{opt}{vendor_errors}, -- "since=s" => \$conf{opt}{since}, -- "vendor-platforms" => \$conf{opt}{vendor_platforms}, -- ); -+ "help" => sub {usage (0)}, -+ "quiet" => \$conf{opt}{quiet}, -+ "print-labels" => \$conf{opt}{print_labels}, -+ "guess-labels" => \$conf{opt}{guess_dimm_label}, -+ "register-labels" => \$conf{opt}{register_labels}, -+ "delay:s" => \$conf{opt}{delay}, -+ "labeldb=s" => \$conf{labeldb}, -+ "status" => \$conf{opt}{status}, -+ "layout" => \$conf{opt}{display_memory_layout}, -+ "summary" => \$conf{opt}{summary}, -+ "errors" => \$conf{opt}{errors}, -+ "error-count" => \$conf{opt}{error_count}, -+ "vendor-errors-summary" => 
\$conf{opt}{vendor_errors_summary}, -+ "vendor-errors" => \$conf{opt}{vendor_errors}, -+ "since=s" => \$conf{opt}{since}, -+ "vendor-platforms" => \$conf{opt}{vendor_platforms}, -+ ); - - usage(1) if !$rc; - - usage (0) if !grep $conf{opt}{$_}, keys %{$conf{opt}}; - - if ($conf{opt}{delay} && !$conf{opt}{register_labels}) { -- log_error ("Only use --delay with --register-labels\n"); -- exit (1); -+ log_error ("Only use --delay with --register-labels\n"); -+ exit (1); - } - - if ($conf{opt}{since}) { -- if ($conf{opt}{since} !~ /^20\d\d-[01]\d-[0-3]\d/) { -- log_error ("--since requires a date like yyyy-mm-dd where yyyy is the year, mm the month, and dd the day\n"); -- exit (1); -- } -- $conf{opt}{since} = " where timestamp>='$conf{opt}{since}'"; -+ if ($conf{opt}{since} !~ /^20\d\d-[01]\d-[0-3]\d/) { -+ log_error ("--since requires a date like yyyy-mm-dd where yyyy is the year, mm the month, and dd the day\n"); -+ exit (1); -+ } -+ $conf{opt}{since} = " where timestamp>='$conf{opt}{since}'"; - } - } - -@@ -239,14 +239,14 @@ sub print_status - { - my $status = 0; - open (MODULES, "/proc/modules") -- or die "Unable to open /proc/modules: $!\n"; -+ or die "Unable to open /proc/modules: $!\n"; - - while () { - $status = 1 if /_edac/; - } - - print "$prog: drivers ", ($status ? 
"are" : "not"), " loaded.\n" -- unless $conf{opt}{quiet}; -+ unless $conf{opt}{quiet}; - - return ($status); - } -@@ -256,118 +256,118 @@ sub parse_dimm_nodes - my $file = $File::Find::name; - - if (($file =~ /max_location$/)) { -- open IN, $file; -- my $location = ; -- $location =~ s/\s+$//; -- close IN; -- my @temp = split(/ /, $location); -- -- $layers[0] = "mc"; -- -- if (m,/mc/mc(\d+),) { -- $max_pos[0] = $1 if (!exists($max_pos[0]) || $1 > $max_pos[0]); -- } else { -- $max_pos[0] = 0 if (!exists($max_pos[0])); -- } -- for (my $i = 0; $i < scalar(@temp); $i += 2) { -- $layers[$i / 2 + 1] = $temp[$i]; -- $max_pos[$i / 2 + 1] = $temp[$i + 1]; -- } -- -- return; -+ open IN, $file; -+ my $location = ; -+ $location =~ s/\s+$//; -+ close IN; -+ my @temp = split(/ /, $location); -+ -+ $layers[0] = "mc"; -+ -+ if (m,/mc/mc(\d+),) { -+ $max_pos[0] = $1 if (!exists($max_pos[0]) || $1 > $max_pos[0]); -+ } else { -+ $max_pos[0] = 0 if (!exists($max_pos[0])); -+ } -+ for (my $i = 0; $i < scalar(@temp); $i += 2) { -+ $layers[$i / 2 + 1] = $temp[$i]; -+ $max_pos[$i / 2 + 1] = $temp[$i + 1]; -+ } -+ -+ return; - } - if ($file =~ /size_mb$/) { -- my $mc = $file; -- $mc =~ s,.*mc(\d+).*,$1,; -+ my $mc = $file; -+ $mc =~ s,.*mc(\d+).*,$1,; - -- my $csrow = $file; -- $csrow =~ s,.*csrow(\d+).*,$1,; -+ my $csrow = $file; -+ $csrow =~ s,.*csrow(\d+).*,$1,; - -- open IN, $file; -- my $size = ; -- close IN; -+ open IN, $file; -+ my $size = ; -+ close IN; - -- my $str_loc = join(':', $mc, $csrow); -- $csrow_size{$str_loc} = $size; -+ my $str_loc = join(':', $mc, $csrow); -+ $csrow_size{$str_loc} = $size; - -- return; -+ return; - } - if ($file =~ /location$/) { -- my $mc = $file; -- $mc =~ s,.*mc(\d+).*,$1,; -- -- my $dimm = $file; -- $dimm =~ s,.*(rank|dimm)(\d+).*,$2,; -- -- open IN, $file; -- my $location = ; -- $location =~ s/\s+$//; -- close IN; -- -- my @pos; -- -- # Get the name of the hierarchy labels -- if (!@layers) { -- my @temp = split(/ /, $location); -- $max_pos[0] = 0; 
-- $layers[0] = "mc"; -- for (my $i = 0; $i < scalar(@temp); $i += 2) { -- $layers[$i / 2 + 1] = $temp[$i]; -- $max_pos[$i / 2 + 1] = 0; -- } -- } -- -- my @temp = split(/ /, $location); -- for (my $i = 1; $i < scalar(@temp); $i += 2) { -- $pos[$i / 2] = $temp[$i]; -- -- if ($pos[$i / 2] > $max_pos[$i / 2 + 1]) { -- $max_pos[$i / 2 + 1] = $pos[$i / 2]; -- } -- } -- if ($mc > $max_pos[0]) { -- $max_pos[0] = $mc; -- } -- -- # Get DIMM size -- -- $file =~ s/dimm_location/size/; -- open IN, $file; -- my $size = ; -- close IN; -- -- my $str_loc = join(':', $mc, @pos); -- $dimm_size{$str_loc} = $size; -- $dimm_node{$str_loc} = $dimm; -- $file =~ s/size/dimm_label/; -- $dimm_label_file{$str_loc} = $file; -- $dimm_location{$str_loc} = $location; -- -- my $count; -- -- $file =~s/dimm_label/dimm_ce_count/; -- if (-e $file) { -- open IN, $file; -- chomp($count = ); -- close IN; -- } else { -- log_error ("dimm_ce_count not found in sysfs. Old kernel?\n"); -- exit -1; -- } -- $dimm_ce_count{$str_loc} = $count; -- -- $file =~s/dimm_ce_count/dimm_ue_count/; -- if (-e $file) { -- open IN, $file; -- chomp($count = ); -- close IN; -- } else { -- log_error ("dimm_ue_count not found in sysfs. 
Old kernel?\n"); -- exit -1; -- } -- $dimm_ue_count{$str_loc} = $count; -- -- return; -+ my $mc = $file; -+ $mc =~ s,.*mc(\d+).*,$1,; -+ -+ my $dimm = $file; -+ $dimm =~ s,.*(rank|dimm)(\d+).*,$2,; -+ -+ open IN, $file; -+ my $location = ; -+ $location =~ s/\s+$//; -+ close IN; -+ -+ my @pos; -+ -+ # Get the name of the hierarchy labels -+ if (!@layers) { -+ my @temp = split(/ /, $location); -+ $max_pos[0] = 0; -+ $layers[0] = "mc"; -+ for (my $i = 0; $i < scalar(@temp); $i += 2) { -+ $layers[$i / 2 + 1] = $temp[$i]; -+ $max_pos[$i / 2 + 1] = 0; -+ } -+ } -+ -+ my @temp = split(/ /, $location); -+ for (my $i = 1; $i < scalar(@temp); $i += 2) { -+ $pos[$i / 2] = $temp[$i]; -+ -+ if ($pos[$i / 2] > $max_pos[$i / 2 + 1]) { -+ $max_pos[$i / 2 + 1] = $pos[$i / 2]; -+ } -+ } -+ if ($mc > $max_pos[0]) { -+ $max_pos[0] = $mc; -+ } -+ -+ # Get DIMM size -+ -+ $file =~ s/dimm_location/size/; -+ open IN, $file; -+ my $size = ; -+ close IN; -+ -+ my $str_loc = join(':', $mc, @pos); -+ $dimm_size{$str_loc} = $size; -+ $dimm_node{$str_loc} = $dimm; -+ $file =~ s/size/dimm_label/; -+ $dimm_label_file{$str_loc} = $file; -+ $dimm_location{$str_loc} = $location; -+ -+ my $count; -+ -+ $file =~s/dimm_label/dimm_ce_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ce_count{$str_loc} = $count; -+ -+ $file =~s/dimm_ce_count/dimm_ue_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ue_count not found in sysfs. 
Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ue_count{$str_loc} = $count; -+ -+ return; - } - } - -@@ -376,14 +376,14 @@ sub guess_product { - my $pname = undef; - - if (open (VENDOR, "/sys/class/dmi/id/product_vendor")) { -- $pvendor = ; -- close VENDOR; -- chomp($pvendor); -+ $pvendor = ; -+ close VENDOR; -+ chomp($pvendor); - } - if (open (NAME, "/sys/class/dmi/id/product_name")) { -- $pname = ; -- close NAME; -- chomp($pname); -+ $pname = ; -+ close NAME; -+ chomp($pname); - } - - return ($pvendor, $pname); -@@ -394,11 +394,11 @@ sub get_mainboard_info { - my ($pvendor, $pname); - - if ($conf{opt}{mainboard} && $conf{opt}{mainboard} ne "report") { -- ($vendor, $model) = split (/[: ]/, $conf{opt}{mainboard}, 2); -+ ($vendor, $model) = split (/[: ]/, $conf{opt}{mainboard}, 2); - } - - if (!$vendor || !$model) { -- ($vendor, $model) = guess_vendor_model (); -+ ($vendor, $model) = guess_vendor_model (); - } - - $conf{mainboard}{vendor} = $vendor; -@@ -407,9 +407,9 @@ sub get_mainboard_info { - ($pvendor, $pname) = guess_product (); - # since product vendor is rare, use mainboard's vendor - if ($pvendor) { -- $conf{mainboard}{product_vendor} = $pvendor; -+ $conf{mainboard}{product_vendor} = $pvendor; - } else { -- $conf{mainboard}{product_vendor} = $vendor; -+ $conf{mainboard}{product_vendor} = $vendor; - } - $conf{mainboard}{product_name} = $pname if $pname; - } -@@ -427,25 +427,25 @@ sub guess_vendor_model_dmidecode { - - LINE: - while () { -- $line++; -+ $line++; - -- /^(\s*)(board|base board|system) information/i || next LINE; -- my $indent = $1; -+ /^(\s*)(board|base board|system) information/i || next LINE; -+ my $indent = $1; - my $type = $2; - -- while ( ) { -- /^(\s*)/; -- $1 lt $indent && last LINE; -- $indent = $1; -- if ($type eq "system") { -- /(?:manufacturer|vendor):\s*(.*\S)\s*/i && ( $system_vendor = $1 ); -- /product(?: name)?:\s*(.*\S)\s*/i && ( $system_model = $1 ); -- } else { -- /(?:manufacturer|vendor):\s*(.*\S)\s*/i && ( $vendor = $1 ); -- 
/product(?: name)?:\s*(.*\S)\s*/i && ( $model = $1 ); -- } -- last LINE if ($vendor && $model); -- } -+ while ( ) { -+ /^(\s*)/; -+ $1 lt $indent && last LINE; -+ $indent = $1; -+ if ($type eq "system") { -+ /(?:manufacturer|vendor):\s*(.*\S)\s*/i && ( $system_vendor = $1 ); -+ /product(?: name)?:\s*(.*\S)\s*/i && ( $system_model = $1 ); -+ } else { -+ /(?:manufacturer|vendor):\s*(.*\S)\s*/i && ( $vendor = $1 ); -+ /product(?: name)?:\s*(.*\S)\s*/i && ( $model = $1 ); -+ } -+ last LINE if ($vendor && $model); -+ } - } - - close (DMI); -@@ -483,18 +483,18 @@ sub parse_mainboard_config - - open (CFG, "$file") or die "Failed to read mainboard config: $file: $!\n"; - while () { -- $line++; -- chomp; # remove newline -- s/^((?:[^'"#]*(?:(['"])[^\2]*\2)*)*)#.*/$1/; # remove comments -- s/^\s+//; # remove leading space -- s/\s+$//; # remove trailing space -- next unless length; # skip blank lines -- if (my ($key, $val) = /^\s*([-\w]+)\s*=\s*(.*)/) { -- $hash{$key}{val} = $val; -- $hash{$key}{line} = $line; -- next; -- } -- return undef; -+ $line++; -+ chomp; # remove newline -+ s/^((?:[^'"#]*(?:(['"])[^\2]*\2)*)*)#.*/$1/; # remove comments -+ s/^\s+//; # remove leading space -+ s/\s+$//; # remove trailing space -+ next unless length; # skip blank lines -+ if (my ($key, $val) = /^\s*([-\w]+)\s*=\s*(.*)/) { -+ $hash{$key}{val} = $val; -+ $hash{$key}{line} = $line; -+ next; -+ } -+ return undef; - } - close (CFG) or &log_error ("close $file: $!\n"); - return \%hash; -@@ -507,16 +507,16 @@ sub guess_vendor_model { - # to get the vendor and model information. 
- # - if (-f $conf{mbconfig} ) { -- my $cfg = &parse_mainboard_config ($conf{mbconfig}); -+ my $cfg = &parse_mainboard_config ($conf{mbconfig}); - -- # If mainboard config file specified a script, then try to -- # run the specified script or executable: -- # -- if ($cfg->{"script"}) { -- $cfg = &parse_mainboard_config ("$cfg->{script}{val} |"); -- die "Failed to run mainboard script\n" if (!$cfg); -- } -- return ($cfg->{vendor}{val}, $cfg->{model}{val}); -+ # If mainboard config file specified a script, then try to -+ # run the specified script or executable: -+ # -+ if ($cfg->{"script"}) { -+ $cfg = &parse_mainboard_config ("$cfg->{script}{val} |"); -+ die "Failed to run mainboard script\n" if (!$cfg); -+ } -+ return ($cfg->{vendor}{val}, $cfg->{model}{val}); - } - - ($vendor, $model) = &guess_vendor_model_sysfs (); -@@ -531,10 +531,10 @@ sub guess_dimm_label { - - LINE: - while () { -- /^(\s*)memory device$/i || next LINE; -+ /^(\s*)memory device$/i || next LINE; - my ($dimm_label, $dimm_addr); - -- while () { -+ while () { - if (/^\s*(locator|bank locator)/i) { - my $indent = $1; - $indent =~ tr/A-Z/a-z/; -@@ -552,7 +552,7 @@ sub guess_dimm_label { - next LINE; - } - next LINE if (/^\s*\n/); -- } -+ } - } - - close (DMI); -@@ -568,84 +568,84 @@ sub parse_dimm_labels_file - my $num; - - open (LABELS, "$file") -- or die "Unable to open label database: $file: $!\n"; -+ or die "Unable to open label database: $file: $!\n"; - - while () { -- $line++; -- next if /^#/; -- chomp; -- s/^\s+//; -- s/\s+$//; -- next unless length; -- -- if (/vendor\s*:\s*(.*\S)\s*/i) { -- $vendor = lc $1; -- @models = (); -- @products = (); -- $num = 0; -- next; -- } -- if (/(model|board)\s*:\s*(.*)$/i) { -- !$vendor && die "$file: line $line: MB model without vendor\n"; -- @models = grep { s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2); -- @products = (); -- $num = 0; -- next; -- } -- if (/(product)\s*:\s*(.*)$/i) { -- !$vendor && die "$file: line $line: product without vendor\n"; -- @models = (); 
-- @products = grep { s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2); -- $num = 0; -- next; -- } -- -- # Allow multiple labels to be specified on a single line, -- # separated by ; -- for my $str (split /;/) { -- $str =~ s/^\s*(.*)\s*$/$1/; -- -- next unless (my ($label, $info) = ($str =~ /^(.*)\s*:\s*(.*)$/i)); -- -- unless ($info =~ /\d+(?:[\.\:]\d+)*/) { -- log_error ("$file: $line: Invalid syntax, ignoring: \"$_\"\n"); -- next; -- } -- -- for my $target (split (/[, ]+/, $info)) { -- my $n; -- my ($mc, $top, $mid, $low, $extra) = ($target =~ /(\d+)(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}/); -- -- if (defined($extra)) { -- die ("Error: Only up to 3 layers are currently supported on label db \"$file\"\n"); -- return; -- } elsif (!defined($top)) { -- die ("Error: The label db \"$file\" is defining a zero-layers machine\n"); -- return; -- } else { -- $n = 3; -- if (!defined($low)) { -- $low = 0; -- $n--; -- } -- if (!defined($mid)) { -- $mid = 0; -- $n--; -- } -- map { $lh->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label } -- @models; -- map { $lh_prod->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label } -- @products; -- } -- if (!$num) { -- $num = $n; -- map { $num_layers->{$vendor}{lc $_} = $num } @models; -- map { $num_layers_prod->{$vendor}{lc $_} = $num } @products; -- } elsif ($num != $n) { -- die ("Error: Inconsistent number of layers at label db \"$file\"\n"); -- } -- } -- } -+ $line++; -+ next if /^#/; -+ chomp; -+ s/^\s+//; -+ s/\s+$//; -+ next unless length; -+ -+ if (/vendor\s*:\s*(.*\S)\s*/i) { -+ $vendor = lc $1; -+ @models = (); -+ @products = (); -+ $num = 0; -+ next; -+ } -+ if (/(model|board)\s*:\s*(.*)$/i) { -+ !$vendor && die "$file: line $line: MB model without vendor\n"; -+ @models = grep { s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2); -+ @products = (); -+ $num = 0; -+ next; -+ } -+ if (/(product)\s*:\s*(.*)$/i) { -+ !$vendor && die "$file: line $line: product without vendor\n"; -+ @models = (); -+ @products = grep { 
s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2); -+ $num = 0; -+ next; -+ } -+ -+ # Allow multiple labels to be specified on a single line, -+ # separated by ; -+ for my $str (split /;/) { -+ $str =~ s/^\s*(.*)\s*$/$1/; -+ -+ next unless (my ($label, $info) = ($str =~ /^(.*)\s*:\s*(.*)$/i)); -+ -+ unless ($info =~ /\d+(?:[\.\:]\d+)*/) { -+ log_error ("$file: $line: Invalid syntax, ignoring: \"$_\"\n"); -+ next; -+ } -+ -+ for my $target (split (/[, ]+/, $info)) { -+ my $n; -+ my ($mc, $top, $mid, $low, $extra) = ($target =~ /(\d+)(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}/); -+ -+ if (defined($extra)) { -+ die ("Error: Only up to 3 layers are currently supported on label db \"$file\"\n"); -+ return; -+ } elsif (!defined($top)) { -+ die ("Error: The label db \"$file\" is defining a zero-layers machine\n"); -+ return; -+ } else { -+ $n = 3; -+ if (!defined($low)) { -+ $low = 0; -+ $n--; -+ } -+ if (!defined($mid)) { -+ $mid = 0; -+ $n--; -+ } -+ map { $lh->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label } -+ @models; -+ map { $lh_prod->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label } -+ @products; -+ } -+ if (!$num) { -+ $num = $n; -+ map { $num_layers->{$vendor}{lc $_} = $num } @models; -+ map { $num_layers_prod->{$vendor}{lc $_} = $num } @products; -+ } elsif ($num != $n) { -+ die ("Error: Inconsistent number of layers at label db \"$file\"\n"); -+ } -+ } -+ } - } - - close (LABELS) or die "Error from label db \"$file\" : $!\n"; -@@ -663,8 +663,8 @@ sub parse_dimm_labels - # well as any files under the labels dir - # - for my $file ($conf{labeldb}, <$conf{labeldir}/*>) { -- next unless -r $file; -- parse_dimm_labels_file (\%labels, \%num_layers, \%labels_prod, \%num_layers_prod, $file); -+ next unless -r $file; -+ parse_dimm_labels_file (\%labels, \%num_layers, \%labels_prod, \%num_layers_prod, $file); - } - - return (\%labels, \%num_layers, \%labels_prod, \%num_layers_prod); -@@ -681,9 +681,9 @@ sub read_dimm_label - $pos = 
"$mc:$top" if ($num_layers == 1); - - if (!defined($dimm_node{$pos})) { -- my $label = "$pos missing"; -- $pos = ""; -- return ($label, $pos); -+ my $label = "$pos missing"; -+ $pos = ""; -+ return ($label, $pos); - } - - my $dimm = $dimm_node{$pos}; -@@ -695,8 +695,8 @@ sub read_dimm_label - return ("label missing", "$pos missing") unless -f $dimm_label_file; - - if (!open (LABEL, "$dimm_label_file")) { -- warn "Failed to open $dimm_label_file: $!\n"; -- return ("Error"); -+ warn "Failed to open $dimm_label_file: $!\n"; -+ return ("Error"); - } - - chomp (my $label =