-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enable Fatal error monitoring and CPER file creation
APML RAS Manager Initialization - Added initialization for APML RAS Manager. - Included conditional compilation for APML support. - Added a placeholder error log for PLDM RAS capabilities, indicating that they are yet to be enabled. - The init function repeatedly attempts to get the BMC RAS OOB configuration until successful. - The function initializes the platform with the block IDs that need to be harvested during a crashdump and sets up a D-Bus match on watchdog state changes to detect BIOS POST completion. - It reads CPU IDs for all CPUs and logs errors on failure. - The function also handles BIOS POST completion by configuring PCIE OOB settings and enabling PCIE error thresholds based on watchdog timer changes. - It also clears the SbrmiAlertMask register so APML_ALERT_L will be asserted during a syncflood in the system. - The commit has oem_cper.h providing the outline of the file format for both runtime and crashdump CPER records. - Added additional JSON config parameters to enable OOB registers during initialization. - Overall, this commit provides all the preparation needed to enable the crashdump flow and runtime error monitoring. Crashdump monitoring: - This commit introduces the handling of GPIO events for P0 and P1 APML alerts. - Binds the P0 alert event handler and P1 alert event handler to manage these alerts. - Read RAS status register and check for errors. - Log and send alerts for various RAS errors including: - SYS_MGMT_CTRL_ERR: Trigger cold reset based on policy. - RESET_HANG_ERR: Suggest manual immediate reset. - FATAL_ERROR: Harvest MCA data and reset based on policy. - MCA_ERR_OVERFLOW: Log MCA runtime error counter overflow. - DRAM_CECC_ERR_OVERFLOW: Log DRAM CECC runtime error counter overflow. - PCIE_ERR_OVERFLOW: Log PCIE runtime error counter overflow. CPER record generation: - Add functionality to generate Common Platform Error Record (CPER) entries when a FATAL_ERROR is detected. 
- The system stores a maximum of 10 CPER records in the BMC. - Create D-Bus object paths for each CPER file in the system, allowing for download via Redfish. - Update properties for each CPER object, including Filename, Log, and Timestamp. root@morocco-d89c:~# busctl tree com.amd.RAS `- /com `- /com/amd `- /com/amd/RAS `- /com/amd/RAS/0 Signed-off-by: aasboddu <[email protected]>, Abinaya Dhandapani <[email protected]>
- Loading branch information
Showing
22 changed files
with
4,006 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,267 @@ | ||
#include "error_monitor.hpp" | ||
|
||
extern "C" | ||
{ | ||
#include "apml.h" | ||
#include "apml_common.h" | ||
#include "esmi_cpuid_msr.h" | ||
#include "esmi_mailbox.h" | ||
#include "esmi_rmi.h" | ||
} | ||
#include <boost/asio/deadline_timer.hpp> | ||
#include <boost/asio/posix/stream_descriptor.hpp> | ||
#include <gpiod.hpp> | ||
|
||
namespace amd | ||
{ | ||
namespace ras | ||
{ | ||
namespace apml | ||
{ | ||
class Manager : public amd::ras::Manager | ||
{ | ||
public: | ||
Manager() = delete; | ||
~Manager() = default; | ||
Manager(const Manager&) = delete; | ||
Manager& operator=(const Manager&) = delete; | ||
Manager(Manager&&) = delete; | ||
Manager& operator=(Manager&&) = delete; | ||
|
||
Manager(amd::ras::config::Manager& manager, | ||
sdbusplus::asio::object_server& objectServer, | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus, | ||
boost::asio::io_context& io) : | ||
amd::ras::Manager(manager), p0apmlAlertEvent(io), p1apmlAlertEvent(io), | ||
objectServer(objectServer), systemBus(systemBus), io(io) | ||
{} | ||
|
||
virtual void init(); | ||
|
||
virtual void configure(); | ||
|
||
/** | ||
* @brief Requests GPIO events for hardware alert handling. | ||
* | ||
* This function configures a GPIO line and stream descriptor to listen for | ||
* events. It triggers the provided callback function upon event detection. | ||
* | ||
* @param[in] gpioPin The GPIO pin to monitor. | ||
* @param[in] callback The function to call when an event is detected. | ||
* @param[in] line The GPIO line to use for event detection. | ||
* @param[in] stream The stream descriptor used to listen for events. | ||
*/ | ||
void requestGPIOEvents(const std::string&, const std::function<void()>&, | ||
gpiod::line&, | ||
boost::asio::posix::stream_descriptor&); | ||
|
||
/** | ||
* @brief Handler for P0 alert events. | ||
* | ||
* This function is invoked when an alert event occurs on P0. The function | ||
* handles the event by processing the necessary response. | ||
*/ | ||
void p0AlertEventHandler(); | ||
|
||
/** | ||
* @brief Handler for P1 alert events. | ||
* | ||
* This function is invoked when an alert event occurs on P1. The function | ||
* handles the event by processing the necessary response. | ||
*/ | ||
void p1AlertEventHandler(); | ||
|
||
/** | ||
* @brief GPIO line for handling P0 alert events. | ||
* | ||
* This GPIO line is used to detect hardware alerts for P0 and trigger | ||
* events for processing. | ||
*/ | ||
gpiod::line p0apmlAlertLine; | ||
|
||
/** | ||
* @brief GPIO line for handling P1 alert events. | ||
* | ||
* This GPIO line is used to detect hardware alerts for P1 and trigger | ||
* events for processing. | ||
*/ | ||
gpiod::line p1apmlAlertLine; | ||
|
||
/** @brief Stream descriptor for handling P0 APML alert events. | ||
* | ||
* @details This stream descriptor listens for alert events related to the | ||
* P0 sensor and triggers actions upon detection. | ||
*/ | ||
boost::asio::posix::stream_descriptor p0apmlAlertEvent; | ||
|
||
/** @brief Stream descriptor for handling P1 APML alert events. | ||
* | ||
* @details This stream descriptor listens for alert events related to the | ||
* P1 sensor and triggers actions upon detection. | ||
*/ | ||
boost::asio::posix::stream_descriptor p1apmlAlertEvent; | ||
|
||
private: | ||
sdbusplus::asio::object_server& objectServer; | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus; | ||
|
||
uint8_t progId = 1; | ||
uint64_t recordId = 1; | ||
uint8_t watchdogTimerCounter = 0; | ||
boost::asio::io_context& io; | ||
std::mutex harvestMutex; | ||
std::vector<uint8_t> blockId; | ||
bool apmlInitialized = false; | ||
bool platformInitialized = false; | ||
bool p0AlertProcessed = false; | ||
bool p1AlertProcessed = false; | ||
std::mutex mcaErrorHarvestMtx; | ||
std::mutex dramErrorHarvestMtx; | ||
std::mutex pcieErrorHarvestMtx; | ||
|
||
/** @brief Update processor OOB configuration. | ||
* | ||
* @details This API updates processor OOB configuration | ||
* for MCA, DRAM and PCIe with the user input. | ||
* | ||
* @param[in] oob_config_d_in - oob configuration data containing | ||
* mca_oob_misc0_ec_enable, dram_cecc_oob_ec_mode, | ||
* dram_cecc_leak_rate, pcie_err_reporting_en, | ||
* pcie_ue_oob_counter_en and core_mca_err_reporting_en. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setRasOobConfig(struct oob_config_d_in); | ||
|
||
/** @brief Get processor OOB configuration. | ||
* | ||
* @details This API reads processor OOB configuration | ||
* for MCA, DRAM and PCIe. | ||
* | ||
* @param[out] oob_config_d_in - oob configuration data containing | ||
* mca_oob_misc0_ec_enable, dram_cecc_oob_ec_mode, | ||
* dram_cecc_leak_rate, pcie_err_reporting_en, | ||
* pcie_ue_oob_counter_en and core_mca_err_reporting_en. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t getRasOobConfig(struct oob_config_d_in*); | ||
|
||
/** @brief Set PCIe OOB error reporting. | ||
* | ||
* @details This API enables OOB configuration for PCIe | ||
* based on PcieAerPollingEn attribute in rasConfigTable. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setPcieOobConfig(); | ||
|
||
/** @brief Update PCIe OOB configuration. | ||
* | ||
* @details This API updates PCIe OOB registers and enables | ||
* PCIe OOB error reporting. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setPcieOobRegisters(); | ||
|
||
/** @brief Set RAS error threshold configuration. | ||
* | ||
* @details This API updates RAS error thresholds for | ||
* MCA, DRAM and PCIe with the user input. | ||
* | ||
* @param[in] run_time_threshold - runtime threshold configuration | ||
* containing error type [00(MCA), 01(DRAM CECC), 10(PCIE_UE), | ||
* 11(PCIE_CE)], error count threshold and max interrupt rate. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setRasErrThreshold(struct run_time_threshold); | ||
|
||
/** @brief Set PCIe error threshold configuration. | ||
* | ||
* @details This API enables PCIe error thresholds | ||
* based on PcieAerThresholdEn attribute in rasConfigTable. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setPcieErrThreshold(); | ||
|
||
/** @brief Clear the SBRMI alert mask bit. | ||
* | ||
* @details Clears alert mask bit in SBRMI control register for the given | ||
* SOC socket number. | ||
* | ||
* @param[in] socNum - Socket number of the processor. | ||
*/ | ||
void clearSbrmiAlertMask(uint8_t socNum); | ||
|
||
/** @brief Monitors the current host power state. | ||
* | ||
* @details This API monitors the current host power state using | ||
* xyz.openbmc_project.State.Host D-bus Interface. | ||
*/ | ||
void currentHostStateMonitor(); | ||
|
||
/** @brief Initializes platform-specific settings. | ||
* | ||
* @details It initializes the platform based on the family ID. Block ID's | ||
* are selected based on the platform that needs to be harvested during a | ||
* crashdump. It also invokes clearSbrmiAlertMask() API to clear | ||
* Sbrmi::AlertMask bit | ||
*/ | ||
void platformInitialize(); | ||
|
||
/** @brief decodes the APML_ALERT_L assertion cause by checking RAS status | ||
* register. | ||
* | ||
* It reads RAS status register and check if the APML assertion is due to | ||
* Fatal error or runtime error overflow and takes the necessary actions. | ||
*/ | ||
bool decodeInterrupt(uint8_t); | ||
|
||
/** @brief Check the validity of MCA banks. | ||
* | ||
* This function performs a validity check on the MCA banks. | ||
* returns a boolean indicating whether the MCA banks are valid. | ||
*/ | ||
bool harvestMcaValidityCheck(uint8_t, uint16_t*, uint16_t*); | ||
|
||
/** @brief Checks the validity of runtime errors. */ | ||
oob_status_t runTimeErrValidityCheck(uint8_t, struct ras_rt_err_req_type, | ||
struct ras_rt_valid_err_inst*); | ||
|
||
/** @brief Harvests runtime errors. */ | ||
void harvestRuntimeErrors(uint8_t, struct ras_rt_valid_err_inst, | ||
struct ras_rt_valid_err_inst); | ||
|
||
void runTimeErrorInfoCheck(uint8_t, uint8_t); | ||
/** @brief Harvests MCA data banks. */ | ||
void harvestMcaDataBanks(uint8_t, uint16_t, uint16_t); | ||
|
||
/** @brief Retrieves the last transaction address. */ | ||
void getLastTransAddr(const std::shared_ptr<FatalCperRecord>&, uint8_t); | ||
|
||
/** @brief Harvests debug log ID dump data */ | ||
void harvestDebugLogDump(const std::shared_ptr<FatalCperRecord>&, uint8_t, | ||
uint8_t, int64_t*, uint16_t&); | ||
/** @brief Dumps the processor error section of the CPER record*/ | ||
template <typename T> | ||
void dumpProcErrorSection(const std::shared_ptr<T>& data, uint8_t soc_num, | ||
struct ras_rt_valid_err_inst inst, | ||
uint8_t category, uint16_t Section, | ||
uint32_t* Severity, uint64_t* CheckInfo); | ||
|
||
/** @brief Harvests DRAM CECC error counters */ | ||
void harvestDramCeccErrorCounters(struct ras_rt_valid_err_inst, uint8_t); | ||
}; | ||
|
||
} // namespace apml | ||
} // namespace ras | ||
} // namespace amd |
Oops, something went wrong.