From 3613597f67b19d3719161a344eb3fd8d89200695 Mon Sep 17 00:00:00 2001 From: ks734 Date: Tue, 5 May 2026 10:35:19 +0000 Subject: [PATCH] RDKEMW-16955: Log firebolt state of the app in OOMCrash plugin --- bundle/lib/source/DobbySpecConfig.cpp | 9 +- bundle/runtime-schemas/defs-plugins.json | 3 - rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 214 ++++++++++++++---- rdkPlugins/OOMCrash/source/OOMCrashPlugin.h | 1 + 4 files changed, 182 insertions(+), 45 deletions(-) diff --git a/bundle/lib/source/DobbySpecConfig.cpp b/bundle/lib/source/DobbySpecConfig.cpp index 5d85f526b..8b35bcd42 100644 --- a/bundle/lib/source/DobbySpecConfig.cpp +++ b/bundle/lib/source/DobbySpecConfig.cpp @@ -635,7 +635,7 @@ bool DobbySpecConfig::parseSpec(ctemplate::TemplateDictionary* dictionary, // step 6 - enable the RDK plugins section dictionary->ShowSection(ENABLE_RDK_PLUGINS); - // step 6.5 - add any default plugins in the settings file + // step 6.1 - add any default plugins in the settings file Json::Value rdkPluginData = mRdkPluginsData; for (const auto& pluginName : mDefaultPlugins) { @@ -643,6 +643,13 @@ bool DobbySpecConfig::parseSpec(ctemplate::TemplateDictionary* dictionary, mRdkPluginsJson[pluginName]["required"] = false; } + + // step 6.2 - always enable the OOMCrash plugin (unless already configured) + if (!mRdkPluginsJson.isMember("oomcrash")) + { + mRdkPluginsJson["oomcrash"]["data"] = Json::Value(Json::objectValue); + mRdkPluginsJson["oomcrash"]["required"] = false; + } // step 7 - process RDK plugins json into dictionary if (!processRdkPlugins(mSpec["rdkPlugins"], mDictionary)) { diff --git a/bundle/runtime-schemas/defs-plugins.json b/bundle/runtime-schemas/defs-plugins.json index ac029e737..9c4aa1c40 100644 --- a/bundle/runtime-schemas/defs-plugins.json +++ b/bundle/runtime-schemas/defs-plugins.json @@ -651,9 +651,6 @@ }, "data": { "type": "object", - "required": [ - "path" - ], "properties": { "path": { "type": "string" diff --git 
a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index 7eaa84f9a..030699094 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -18,6 +18,11 @@ */ #include "OOMCrashPlugin.h" + +#include <map> + +#define FIREBOLT_STATE "fireboltState" +#define FIREBOLT_STATE_PREV "fireboltState_prev" /** * Need to do this at the start of every plugin to make sure the correct * C methods are visible to allow PluginLauncher to find the plugin @@ -70,11 +75,12 @@ bool OOMCrash::postInstallation() return false; } - const std::string path = mContainerConfig->rdk_plugins->oomcrash->data->path; + const char *pathPtr = mContainerConfig->rdk_plugins->oomcrash->data->path; + const std::string path = pathPtr ? pathPtr : ""; if (path.empty()) { - AI_LOG_ERROR("OOMCrash path is empty"); - return false; + AI_LOG_INFO("OOMCrash path not configured, skipping mount setup for container '%s'", mUtils->getContainerId().c_str()); + return true; } if (!mUtils->mkdirRecursive((mRootfsPath + path).c_str(), 0755) && errno != EEXIST) @@ -112,23 +118,17 @@ bool OOMCrash::postHalt() return false; } - bool oomDetected = false; - if (mUtils->exitStatus != 0) - oomDetected = checkForOOM(); + bool oomDetected = checkForOOM(); + + const char *pathPtr = mContainerConfig->rdk_plugins->oomcrash->data->path; + const std::string path = pathPtr ? 
pathPtr : ""; - if (oomDetected) + if (oomDetected && !path.empty()) createFileForOOM(); // Remove the crashFile if container exits normally or if no OOM detected - if (mUtils->exitStatus == 0 || !oomDetected) + if (!path.empty() && (mUtils->exitStatus == 0 || !oomDetected)) { - std::string path = mContainerConfig->rdk_plugins->oomcrash->data->path; - if (path.empty()) - { - AI_LOG_ERROR("OOMCrash path is empty"); - return false; - } - std::string crashFile = path + "/oom_crashed_" + mUtils->getContainerId() + ".txt"; if (remove(crashFile.c_str()) != 0) { @@ -173,68 +173,200 @@ std::vector OOMCrash::getDependencies() const } /** - * @brief Read cgroup file. + * @brief Read the oom_kill counter from the cgroup memory.oom_control file. + * + * The memory.oom_control file contains multiple key-value lines, e.g.: * - * @param[out] val gives the number of times that the cgroup limit was exceeded. + * Kernel >= 4.13: + * oom_kill_disable 0 + * under_oom 0 + * oom_kill 1 * - * @return true on successfully reading from the file. + * Kernel < 4.13: + * oom_kill_disable 0 + * under_oom 0 + * + * On older kernels the 'oom_kill' counter does not exist, so we fall back + * to the 'under_oom' flag which is 1 while the cgroup is in OOM state. + * + * @param[out] val Set to the value of the 'oom_kill' field (or 'under_oom' + * on older kernels) on success. + * + * @return true on successfully reading and parsing the field. 
*/ bool OOMCrash::readCgroup(unsigned long *val) { - std::string path = "/sys/fs/cgroup/memory/" + mUtils->getContainerId() + "/memory.failcnt"; + std::string path = "/sys/fs/cgroup/memory/" + mUtils->getContainerId() + "/memory.oom_control"; FILE *fp = fopen(path.c_str(), "r"); if (!fp) { - if (errno != ENOENT) - AI_LOG_ERROR("failed to open '%s' (%d - %s)", path.c_str(), errno, strerror(errno)); - + AI_LOG_ERROR("failed to open '%s' (%d - %s)", path.c_str(), errno, strerror(errno)); return false; } char* line = nullptr; size_t len = 0; ssize_t rd; + bool foundOomKill = false; + unsigned long underOom = 0; + bool foundUnderOom = false; - if ((rd = getline(&line, &len, fp)) < 0) + while ((rd = getline(&line, &len, fp)) > 0) { - if (line) - free(line); - fclose(fp); - AI_LOG_ERROR("failed to read cgroup file line (%d - %s)", errno, strerror(errno)); - return false; + unsigned long v; + // "oom_kill_disable" cannot match here: a space in a scanf format also + // matches zero whitespace, but %lu then fails to convert "_disable". + if (sscanf(line, "oom_kill %lu", &v) == 1) + { + *val = v; + foundOomKill = true; + break; + } + if (sscanf(line, "under_oom %lu", &v) == 1) + { + underOom = v; + foundUnderOom = true; + } } - *val = strtoul(line, nullptr, 0); - + if (line) + free(line); fclose(fp); - free(line); - return true; + // Prefer oom_kill (kernel >= 4.13); fall back to under_oom for older kernels + if (foundOomKill) + return true; + + if (foundUnderOom) + { + AI_LOG_INFO("'oom_kill' field not present (kernel < 4.13), using 'under_oom' fallback"); + *val = underOom; + return true; + } + + AI_LOG_ERROR("neither 'oom_kill' nor 'under_oom' found in '%s'", path.c_str()); + return false; +} + +/** + * @brief Check if memory (or memory+swap) max usage reached the configured + * limit, indicating the container hit its memory ceiling. + * + * This is used as a fallback OOM indicator on older kernels (< 4.13) where + * the oom_kill counter does not exist and under_oom is transient. 
+ * memory.max_usage_in_bytes is the high-water mark and persists until the + * cgroup is destroyed. + * + * @return true if max usage >= limit for memory or memory+swap. + */ +bool OOMCrash::isMemoryAtLimit() +{ + std::string basePath = "/sys/fs/cgroup/memory/" + mUtils->getContainerId(); + + const char *pairs[][2] = { + { "/memory.max_usage_in_bytes", "/memory.limit_in_bytes" }, + { "/memory.memsw.max_usage_in_bytes", "/memory.memsw.limit_in_bytes" }, + }; + + for (const auto &pair : pairs) + { + unsigned long maxUsage = 0, limit = 0; + std::string maxPath = basePath + pair[0]; + std::string limPath = basePath + pair[1]; + + FILE *fpMax = fopen(maxPath.c_str(), "r"); + FILE *fpLim = fopen(limPath.c_str(), "r"); + + bool ok = (fpMax && fpLim && + fscanf(fpMax, "%lu", &maxUsage) == 1 && + fscanf(fpLim, "%lu", &limit) == 1); + + if (fpMax) fclose(fpMax); + if (fpLim) fclose(fpLim); + + if (ok && limit > 0 && maxUsage >= limit) + { + AI_LOG_INFO("%s=%lu reached %s=%lu", pair[0]+1, maxUsage, pair[1]+1, limit); + return true; + } + } + + return false; } /** - * @brief Check for Out of Memory by reading cgroup file. + * @brief Check for Out of Memory by reading cgroup files. + * + * Detection priority: + * 1. oom_kill > 0 (kernel >= 4.13, definitive) + * 2. under_oom > 0 (kernel < 4.13, transient flag) + * 3. max_usage_in_bytes >= limit (all kernels, persistent high-water mark) * * @return true if OOM detected. 
*/ bool OOMCrash::checkForOOM() { - unsigned long failCnt; - bool status; - if (readCgroup(&failCnt) && (failCnt > 0)) + unsigned long oomKill = 0; + bool cgroupRead = readCgroup(&oomKill); + + // Priority 1 & 2: oom_kill or under_oom confirmed OOM + if (cgroupRead && oomKill > 0) { - AI_LOG_WARN("memory allocation failure detected in %s container, likely OOM (failcnt = %lu)", mUtils->getContainerId().c_str(), failCnt); - status = true; + AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'", + oomKill, mUtils->getContainerId().c_str()); + } + // Priority 3: oom_control gave no OOM indication (e.g. under_oom already cleared on kernel < 4.13) — check max_usage + else if (isMemoryAtLimit()) + { + AI_LOG_WARN("oom_control did not confirm OOM but max memory usage reached limit " + "for container '%s'", mUtils->getContainerId().c_str()); } else { - AI_LOG_WARN("No OOM failure detected in %s container", mUtils->getContainerId().c_str()); - status = false; + AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); + return false; } - return status; + + // OOM confirmed, or inferred from max usage reaching the limit - retrieve firebolt state from annotations. + // AppService often transitions the app to "background" after the OOM kill + // but before postHalt runs. Since the container exited abnormally, prefer + // the previous fireboltState value (which was the state at the time of the + // actual OOM kill) over the current value which may have been overwritten + // by a post-crash transition. 
+ std::map<std::string, std::string> annotations = mUtils->getAnnotations(); + std::string fireboltState; + + auto prevIt = annotations.find(FIREBOLT_STATE_PREV); + if (prevIt != annotations.end()) + { + fireboltState = prevIt->second; + AI_LOG_INFO("Using previous fireboltState '%s' (current may have been " + "set after OOM kill)", fireboltState.c_str()); + } + else + { + auto it = annotations.find(FIREBOLT_STATE); + if (it != annotations.end()) + { + fireboltState = it->second; + } + } + + if (!fireboltState.empty()) + { + AI_LOG_WARN("OOM kill detected: container '%s' fireboltState '%s'", + mUtils->getContainerId().c_str(), fireboltState.c_str()); + } + else + { + AI_LOG_WARN("OOM kill detected: container '%s' (firebolt state unknown)", + mUtils->getContainerId().c_str()); + } + + return true; } /** diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h index b43a16327..fd3ed42e1 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h @@ -57,6 +57,7 @@ class OOMCrash : public RdkPluginBase private: bool readCgroup(unsigned long *val); + bool isMemoryAtLimit(); bool checkForOOM(); void createFileForOOM();