Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion bundle/lib/source/DobbySpecConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -635,14 +635,21 @@ bool DobbySpecConfig::parseSpec(ctemplate::TemplateDictionary* dictionary,
// step 6 - enable the RDK plugins section
dictionary->ShowSection(ENABLE_RDK_PLUGINS);

// step 6.5 - add any default plugins in the settings file
// step 6.1 - add any default plugins in the settings file
Json::Value rdkPluginData = mRdkPluginsData;
for (const auto& pluginName : mDefaultPlugins)
{
mRdkPluginsJson[pluginName]["data"] = rdkPluginData[pluginName];
mRdkPluginsJson[pluginName]["required"] = false;
}


// step 6.2 - always enable the OOMCrash plugin (unless already configured)
if (!mRdkPluginsJson.isMember("oomcrash"))
{
mRdkPluginsJson["oomcrash"]["data"] = Json::Value(Json::objectValue);
mRdkPluginsJson["oomcrash"]["required"] = false;
}
// step 7 - process RDK plugins json into dictionary
if (!processRdkPlugins(mSpec["rdkPlugins"], mDictionary))
{
Expand Down
3 changes: 0 additions & 3 deletions bundle/runtime-schemas/defs-plugins.json
Original file line number Diff line number Diff line change
Expand Up @@ -651,9 +651,6 @@
},
"data": {
"type": "object",
"required": [
"path"
],
"properties": {
"path": {
"type": "string"
Expand Down
214 changes: 173 additions & 41 deletions rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
*/

#include "OOMCrashPlugin.h"

#include <map>

#define FIREBOLT_STATE "fireboltState"
#define FIREBOLT_STATE_PREV "fireboltState_prev"
/**
* Need to do this at the start of every plugin to make sure the correct
* C methods are visible to allow PluginLauncher to find the plugin
Expand Down Expand Up @@ -70,11 +75,12 @@ bool OOMCrash::postInstallation()
return false;
}

const std::string path = mContainerConfig->rdk_plugins->oomcrash->data->path;
const char *pathPtr = mContainerConfig->rdk_plugins->oomcrash->data->path;
const std::string path = pathPtr ? pathPtr : "";
if (path.empty())
{
AI_LOG_ERROR("OOMCrash path is empty");
return false;
AI_LOG_INFO("OOMCrash path not configured, skipping mount setup for container '%s'", mUtils->getContainerId().c_str());
return true;
}

if (!mUtils->mkdirRecursive((mRootfsPath + path).c_str(), 0755) && errno != EEXIST)
Expand Down Expand Up @@ -112,23 +118,17 @@ bool OOMCrash::postHalt()
return false;
}

bool oomDetected = false;
if (mUtils->exitStatus != 0)
oomDetected = checkForOOM();
bool oomDetected = checkForOOM();

const char *pathPtr = mContainerConfig->rdk_plugins->oomcrash->data->path;
const std::string path = pathPtr ? pathPtr : "";

if (oomDetected)
if (oomDetected && !path.empty())
createFileForOOM();

// Remove the crashFile if container exits normally or if no OOM detected
if (mUtils->exitStatus == 0 || !oomDetected)
if (!path.empty() && (mUtils->exitStatus == 0 || !oomDetected))
{
std::string path = mContainerConfig->rdk_plugins->oomcrash->data->path;
if (path.empty())
{
AI_LOG_ERROR("OOMCrash path is empty");
return false;
}

std::string crashFile = path + "/oom_crashed_" + mUtils->getContainerId() + ".txt";
if (remove(crashFile.c_str()) != 0)
{
Expand Down Expand Up @@ -173,68 +173,200 @@ std::vector<std::string> OOMCrash::getDependencies() const
}

/**
* @brief Read cgroup file.
* @brief Read the oom_kill counter from the cgroup memory.oom_control file.
*
* The memory.oom_control file contains multiple key-value lines, e.g.:
*
* @param[out] val gives the number of times that the cgroup limit was exceeded.
* Kernel >= 4.13:
* oom_kill_disable 0
* under_oom 0
* oom_kill 1
*
* @return true on successfully reading from the file.
* Kernel < 4.13:
* oom_kill_disable 0
* under_oom 0
*
* On older kernels the 'oom_kill' counter does not exist, so we fall back
* to the 'under_oom' flag which is 1 while the cgroup is in OOM state.
*
* @param[out] val Set to the value of the 'oom_kill' field (or 'under_oom'
* on older kernels) on success.
*
* @return true on successfully reading and parsing the field.
*/

bool OOMCrash::readCgroup(unsigned long *val)
{
std::string path = "/sys/fs/cgroup/memory/" + mUtils->getContainerId() + "/memory.failcnt";
std::string path = "/sys/fs/cgroup/memory/" + mUtils->getContainerId() + "/memory.oom_control";

FILE *fp = fopen(path.c_str(), "r");
if (!fp)
{
if (errno != ENOENT)
AI_LOG_ERROR("failed to open '%s' (%d - %s)", path.c_str(), errno, strerror(errno));

AI_LOG_ERROR("failed to open '%s' (%d - %s)", path.c_str(), errno, strerror(errno));
return false;
}

char* line = nullptr;
size_t len = 0;
ssize_t rd;
bool foundOomKill = false;
unsigned long underOom = 0;
bool foundUnderOom = false;

if ((rd = getline(&line, &len, fp)) < 0)
while ((rd = getline(&line, &len, fp)) > 0)
{
if (line)
free(line);
fclose(fp);
AI_LOG_ERROR("failed to read cgroup file line (%d - %s)", errno, strerror(errno));
return false;
unsigned long v;
// sscanf won't match "oom_kill_disable" because the space in the
// format requires whitespace where "_disable" has an underscore.
if (sscanf(line, "oom_kill %lu", &v) == 1)
{
*val = v;
foundOomKill = true;
break;
}
if (sscanf(line, "under_oom %lu", &v) == 1)
{
underOom = v;
foundUnderOom = true;
}
}

*val = strtoul(line, nullptr, 0);

if (line)
free(line);
fclose(fp);
free(line);

return true;
// Prefer oom_kill (kernel >= 4.13); fall back to under_oom for older kernels
if (foundOomKill)
return true;

if (foundUnderOom)
{
AI_LOG_INFO("'oom_kill' field not present (kernel < 4.13), using 'under_oom' fallback");
*val = underOom;
return true;
}

AI_LOG_ERROR("neither 'oom_kill' nor 'under_oom' found in '%s'", path.c_str());
return false;
}

/**
 * @brief Check whether peak memory (or memory+swap) usage reached the
 *        configured cgroup limit for this container.
 *
 * Two (max usage, limit) file pairs under
 * /sys/fs/cgroup/memory/<containerId>/ are examined in order:
 *
 *   memory.max_usage_in_bytes        vs  memory.limit_in_bytes
 *   memory.memsw.max_usage_in_bytes  vs  memory.memsw.limit_in_bytes
 *
 * A pair is skipped when either file cannot be opened or does not start
 * with an unsigned decimal number (presumably the memsw files are absent
 * when swap accounting is disabled - TODO confirm on target).
 *
 * The values are byte counts, so they are parsed as 'unsigned long long';
 * 'unsigned long' is only 32 bits wide on 32-bit builds and cannot hold
 * limits or usage figures of 4 GiB and above.
 *
 * NOTE(review): reaching the limit means the cgroup hit its ceiling at
 * some point; it does not by itself prove that the kernel OOM killer ran.
 * Callers should treat a 'true' result as a heuristic.
 *
 * @return true if max usage >= limit (with limit > 0) for memory or
 *         memory+swap, false otherwise.
 */
bool OOMCrash::isMemoryAtLimit()
{
    // Reads the leading unsigned decimal value from a cgroup file; returns
    // false if the file can't be opened or doesn't start with a number.
    const auto readValue = [](const std::string &file, unsigned long long *out) -> bool
    {
        FILE *fp = fopen(file.c_str(), "r");
        if (!fp)
            return false;

        const bool parsed = (fscanf(fp, "%llu", out) == 1);
        fclose(fp);
        return parsed;
    };

    const std::string basePath = "/sys/fs/cgroup/memory/" + mUtils->getContainerId();

    // { high-water mark file, limit file } relative to the cgroup directory
    static const char *const pairs[][2] = {
        { "/memory.max_usage_in_bytes",       "/memory.limit_in_bytes" },
        { "/memory.memsw.max_usage_in_bytes", "/memory.memsw.limit_in_bytes" },
    };

    for (const auto &pair : pairs)
    {
        unsigned long long maxUsage = 0;
        unsigned long long limit = 0;

        if (!readValue(basePath + pair[0], &maxUsage) ||
            !readValue(basePath + pair[1], &limit))
        {
            continue;
        }

        if (limit > 0 && maxUsage >= limit)
        {
            // '+ 1' skips the leading '/' so only the file name is logged
            AI_LOG_INFO("%s=%llu reached %s=%llu", pair[0] + 1, maxUsage, pair[1] + 1, limit);
            return true;
        }
    }

    return false;
}

/**
* @brief Check for Out of Memory by reading cgroup file.
* @brief Check for Out of Memory by reading cgroup files.
*
* Detection priority:
* 1. oom_kill > 0 (kernel >= 4.13, definitive)
* 2. under_oom > 0 (kernel < 4.13, transient flag)
* 3. max_usage_in_bytes >= limit (all kernels, persistent high-water mark)
*
* @return true if OOM detected.
*/

bool OOMCrash::checkForOOM()
{
unsigned long failCnt;
bool status;
if (readCgroup(&failCnt) && (failCnt > 0))
unsigned long oomKill = 0;
bool cgroupRead = readCgroup(&oomKill);

// Priority 1 & 2: oom_kill or under_oom confirmed OOM
if (cgroupRead && oomKill > 0)
{
AI_LOG_WARN("memory allocation failure detected in %s container, likely OOM (failcnt = %lu)", mUtils->getContainerId().c_str(), failCnt);
status = true;
AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'",
oomKill, mUtils->getContainerId().c_str());
}
// Priority 3: on kernel < 4.13 under_oom may have cleared — check max_usage
else if (isMemoryAtLimit())
{
AI_LOG_WARN("oom_control did not confirm OOM but max memory usage reached limit "
Comment on lines +321 to +324
"for container '%s'", mUtils->getContainerId().c_str());
}
else
{
AI_LOG_WARN("No OOM failure detected in %s container", mUtils->getContainerId().c_str());
status = false;
AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str());
return false;
}
return status;

// OOM kill confirmed - retrieve firebolt state from annotations.
// AppService often transitions the app to "background" after the OOM kill
// but before postHalt runs. Since the container exited abnormally, prefer
// the previous fireboltState value (which was the state at the time of the
// actual OOM kill) over the current value which may have been overwritten
// by a post-crash transition.
std::map<std::string, std::string> annotations = mUtils->getAnnotations();
std::string fireboltState;

auto prevIt = annotations.find(FIREBOLT_STATE_PREV);
if (prevIt != annotations.end())
{
fireboltState = prevIt->second;
AI_LOG_INFO("Using previous fireboltState '%s' (current may have been "
"set after OOM kill)", fireboltState.c_str());
Comment on lines +342 to +347
}
else
{
auto it = annotations.find(FIREBOLT_STATE);
if (it != annotations.end())
{
fireboltState = it->second;
}
}

if (!fireboltState.empty())
{
AI_LOG_WARN("OOM kill detected: container '%s' fireboltState '%s'",
mUtils->getContainerId().c_str(), fireboltState.c_str());
}
else
{
AI_LOG_WARN("OOM kill detected: container '%s' (firebolt state unknown)",
mUtils->getContainerId().c_str());
}

return true;
}

/**
Expand Down
1 change: 1 addition & 0 deletions rdkPlugins/OOMCrash/source/OOMCrashPlugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class OOMCrash : public RdkPluginBase

private:
bool readCgroup(unsigned long *val);
bool isMemoryAtLimit();
bool checkForOOM();
void createFileForOOM();

Expand Down
Loading