feat: implement alert correlation and context building for enhanced alert analysis

yllada · yllada · commit c59d0004dec8 · 2025-04-28T13:40:31.000-04:00
diff --git a/soc-ai/configurations/const.go b/soc-ai/configurations/const.go
@@ -59,8 +59,9 @@ var (
 		"email": {Regexp: `([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})`, FakeValue: "jhondoe@gmail.com"},
 		//"ipv4":  `(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)`,
 	}
-	GPT_INSTRUCTION    = "You are an expert security engineer. Perform a deep analysis of an alert created by a SIEM and the logs related to it. Determine if the alert could be an actual potential threat or not and explain why. Provide a description that shows a deep understanding of the alert based on a deep analysis of its logs and estimate the risk to the systems affected. Classify the alert in the following manner: if the alert information is sufficient to determine that the security, availability, confidentiality, or integrity of the systems has being compromised, then classify it as \"possible incident\". If the alert does not pose a security risk to the organization or has no security relevance, classify it as \"possible false positive\". If the alert does not pose an imminent risk to the systems, requires no urgent action from an administrator, or requires not urgent review by an administrator, it should be classified as a \"standard alert\". You will also provide context-specific instructions for remediation, mitigation, or further investigation, related to the alert and logs analyzed. Your answer should be provided using the following JSON format and the total number of characters in your answer must not exceed 1500 words. Your entire answer must be inside this json format. {\"activity_id\":\"<activity_id>\",\"classification\":\"<classification>\",\"reasoning\":[\"<deep_reasoning>\"],\"nextSteps\":[{\"step\":1,\"action\":\"<action_1>\",\"details\":\"<action_1_details>\"},{\"step\":2,\"action\":\"<action_2>\",\"details\":\"<action_2_details>\"},{\"step\":3,\"action\":\"<action_3>\"]}Ensure that your entire answer adheres to the provided JSON format. The response should be valid JSON syntax and schema."
-	GPT_FALSE_POSITIVE = "This alert is categorized as a potential false positive due to two key factors. Firstly, it originates from an automated system, which may occasionally produce alerts without direct human validation. Additionally, the absence of any correlated logs further raises suspicion, as a genuine incident typically leaves a trail of relevant log entries. Hence, the combination of its system-generated nature and the lack of associated logs suggests a likelihood of being a false positive rather than a genuine security incident."
+	GPT_INSTRUCTION     = "You are an expert security engineer. Perform a deep analysis of an alert created by a SIEM and the logs related to it. Determine if the alert could be an actual potential threat or not and explain why. Provide a description that shows a deep understanding of the alert based on a deep analysis of its logs and estimate the risk to the systems affected. Classify the alert in the following manner: if the alert information is sufficient to determine that the security, availability, confidentiality, or integrity of the systems has being compromised, then classify it as \"possible incident\". If the alert does not pose a security risk to the organization or has no security relevance, classify it as \"possible false positive\". If the alert does not pose an imminent risk to the systems, requires no urgent action from an administrator, or requires not urgent review by an administrator, it should be classified as a \"standard alert\". You will also provide context-specific instructions for remediation, mitigation, or further investigation, related to the alert and logs analyzed. Your answer should be provided using the following JSON format and the total number of characters in your answer must not exceed 1500 words. Your entire answer must be inside this json format. {\"activity_id\":\"<activity_id>\",\"classification\":\"<classification>\",\"reasoning\":[\"<deep_reasoning>\"],\"nextSteps\":[{\"step\":1,\"action\":\"<action_1>\",\"details\":\"<action_1_details>\"},{\"step\":2,\"action\":\"<action_2>\",\"details\":\"<action_2_details>\"},{\"step\":3,\"action\":\"<action_3>\"]}Ensure that your entire answer adheres to the provided JSON format. The response should be valid JSON syntax and schema."
+	GPT_FALSE_POSITIVE  = "This alert is categorized as a potential false positive due to two key factors. Firstly, it originates from an automated system, which may occasionally produce alerts without direct human validation. Additionally, the absence of any correlated logs further raises suspicion, as a genuine incident typically leaves a trail of relevant log entries. Hence, the combination of its system-generated nature and the lack of associated logs suggests a likelihood of being a false positive rather than a genuine security incident."
+	CORRELATION_CONTEXT = "\n\nAlert Context: The current alert has historical correlation with previous alerts:\n%s"
 )
 
 func GetInternalKey() string {
diff --git a/soc-ai/elastic/alerts.go b/soc-ai/elastic/alerts.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
+	"strings"
 
 	"github.com/utmstack/soc-ai/configurations"
 	"github.com/utmstack/soc-ai/schema"
@@ -65,3 +66,141 @@ func ChangeAlertStatus(id string, status int, observations string) error {
 
 	return nil
 }
+
+// AlertCorrelation bundles an alert with the historical alerts that were
+// matched against it and the classifications recorded for those alerts.
+type AlertCorrelation struct {
+	// CurrentAlert is the alert the correlation was built for.
+	CurrentAlert    schema.Alert
+	// RelatedAlerts holds the historical alerts matched to CurrentAlert.
+	RelatedAlerts   []schema.Alert
+	// Classifications holds the classification strings of the related
+	// alerts. BuildCorrelationContext reads Classifications[i] when it
+	// renders RelatedAlerts[i], so the two slices are expected to be
+	// index-aligned.
+	Classifications []string
+}
+
+// GetRelatedAlerts runs ElasticSearch against
+// configurations.ALERT_INDEX_PATTERN, passing "*" for both remaining
+// arguments, and decodes the raw result into a slice of
+// schema.GPTAlertResponse.
+//
+// A non-nil error is returned when the search fails or when the payload
+// cannot be unmarshalled; the slice is nil in both cases.
+func GetRelatedAlerts() ([]schema.GPTAlertResponse, error) {
+	raw, searchErr := ElasticSearch(configurations.ALERT_INDEX_PATTERN, "*", "*")
+	if searchErr != nil {
+		return nil, fmt.Errorf("error getting historical alerts: %v", searchErr)
+	}
+
+	var responses []schema.GPTAlertResponse
+	if decodeErr := json.Unmarshal(raw, &responses); decodeErr != nil {
+		return nil, fmt.Errorf("error unmarshalling alerts: %v", decodeErr)
+	}
+
+	return responses, nil
+}
+
+// FindRelatedAlerts collects the historical alerts that isAlertRelated
+// matches against currentAlert, together with the classification stored in
+// the historical response each alert was looked up from.
+//
+// RelatedAlerts and Classifications are appended in lockstep, so
+// Classifications[i] always belongs to RelatedAlerts[i] and the two slices
+// have the same length.
+//
+// An error is returned only when the historical responses cannot be loaded.
+// A failure to load one individual alert skips that entry.
+func FindRelatedAlerts(currentAlert schema.Alert) (*AlertCorrelation, error) {
+	correlation := &AlertCorrelation{
+		CurrentAlert:    currentAlert,
+		RelatedAlerts:   []schema.Alert{},
+		Classifications: []string{},
+	}
+
+	historicalResponses, err := GetRelatedAlerts()
+	if err != nil {
+		return nil, err
+	}
+
+	// NOTE(review): if the history can contain currentAlert itself it will
+	// always match here; confirm whether it should be excluded.
+	for _, resp := range historicalResponses {
+		alert, err := GetAlertsInfo(resp.ActivityID)
+		if err != nil {
+			// Best effort: one unreadable historical alert must not
+			// abort correlation for the remaining ones.
+			continue
+		}
+
+		if !isAlertRelated(currentAlert, alert) {
+			continue
+		}
+
+		// Take the classification from the response that produced this
+		// alert instead of rescanning every response for a matching ID.
+		// That keeps both slices index-aligned and removes the nested loop.
+		correlation.RelatedAlerts = append(correlation.RelatedAlerts, alert)
+		correlation.Classifications = append(correlation.Classifications, resp.Classification)
+	}
+
+	return correlation, nil
+}
+
+// isAlertRelated reports whether historical shares at least one non-zero
+// endpoint attribute with current. Destination fields are compared first,
+// then source fields, each in the order IP, port, host, user; an empty
+// string or zero port on current never counts as a match.
+func isAlertRelated(current, historical schema.Alert) bool {
+	curDst, oldDst := current.Destination, historical.Destination
+	curSrc, oldSrc := current.Source, historical.Source
+
+	switch {
+	case curDst.IP != "" && curDst.IP == oldDst.IP:
+		return true
+	case curDst.Port != 0 && curDst.Port == oldDst.Port:
+		return true
+	case curDst.Host != "" && curDst.Host == oldDst.Host:
+		return true
+	case curDst.User != "" && curDst.User == oldDst.User:
+		return true
+	case curSrc.IP != "" && curSrc.IP == oldSrc.IP:
+		return true
+	case curSrc.Port != 0 && curSrc.Port == oldSrc.Port:
+		return true
+	case curSrc.Host != "" && curSrc.Host == oldSrc.Host:
+		return true
+	case curSrc.User != "" && curSrc.User == oldSrc.User:
+		return true
+	}
+
+	return false
+}
+
+// BuildCorrelationContext renders the related alerts held by correlation as
+// a plain-text section that starts with "\nHistorical Context:\n".
+//
+// For every related alert it writes the name, severity label, category,
+// classification and timestamp, followed by any source/destination IP, host,
+// user and port, the protocol and the numeric severity that are non-zero.
+//
+// A nil correlation yields an empty string. The classification line is left
+// out for a related alert that has no entry in correlation.Classifications,
+// instead of panicking on an out-of-range index.
+func BuildCorrelationContext(correlation *AlertCorrelation) string {
+	if correlation == nil {
+		return ""
+	}
+
+	var b strings.Builder
+
+	// The header text is matched verbatim elsewhere in this change
+	// (gpt/client.go splits on "\nHistorical Context:"), so it must not
+	// change.
+	b.WriteString("\nHistorical Context:\n")
+	fmt.Fprintf(&b, "Found %d related alerts with similar characteristics:\n", len(correlation.RelatedAlerts))
+
+	for i, alert := range correlation.RelatedAlerts {
+		fmt.Fprintf(&b, "\nRelated Alert %d:\n", i+1)
+		fmt.Fprintf(&b, "- Name: %s\n", alert.Name)
+		fmt.Fprintf(&b, "- Severity: %s\n", alert.SeverityLabel)
+		fmt.Fprintf(&b, "- Category: %s\n", alert.Category)
+		// Classifications is a separate slice and may be shorter than
+		// RelatedAlerts; only index it when an entry exists.
+		if i < len(correlation.Classifications) {
+			fmt.Fprintf(&b, "- Classification: %s\n", correlation.Classifications[i])
+		}
+		fmt.Fprintf(&b, "- Time: %s\n", alert.Timestamp)
+
+		if alert.Source.IP != "" {
+			fmt.Fprintf(&b, "- Source IP: %s\n", alert.Source.IP)
+		}
+		if alert.Destination.IP != "" {
+			fmt.Fprintf(&b, "- Destination IP: %s\n", alert.Destination.IP)
+		}
+		if alert.Source.Host != "" {
+			fmt.Fprintf(&b, "- Source Host: %s\n", alert.Source.Host)
+		}
+		if alert.Destination.Host != "" {
+			fmt.Fprintf(&b, "- Destination Host: %s\n", alert.Destination.Host)
+		}
+		if alert.Source.User != "" {
+			fmt.Fprintf(&b, "- Source User: %s\n", alert.Source.User)
+		}
+		if alert.Destination.User != "" {
+			fmt.Fprintf(&b, "- Destination User: %s\n", alert.Destination.User)
+		}
+		if alert.Source.Port != 0 {
+			fmt.Fprintf(&b, "- Source Port: %d\n", alert.Source.Port)
+		}
+		if alert.Destination.Port != 0 {
+			fmt.Fprintf(&b, "- Destination Port: %d\n", alert.Destination.Port)
+		}
+		if alert.Protocol != "" {
+			fmt.Fprintf(&b, "- Protocol: %s\n", alert.Protocol)
+		}
+		if alert.Severity != 0 {
+			fmt.Fprintf(&b, "- Severity: %d\n", alert.Severity)
+		}
+	}
+
+	return b.String()
+}
diff --git a/soc-ai/gpt/client.go b/soc-ai/gpt/client.go
@@ -3,6 +3,7 @@ package gpt
 import (
 	"encoding/json"
 	"fmt"
+	"strings"
 	"sync"
 
 	"github.com/utmstack/soc-ai/configurations"
@@ -26,6 +27,15 @@ func GetGPTClient() *GPTClient {
 
 func (c *GPTClient) Request(alert schema.AlertGPTDetails) (string, error) {
 	content := configurations.GPT_INSTRUCTION
+
+	if alert.Description != "" {
+		correlationContext := strings.Split(alert.Description, "\nHistorical Context:")
+		if len(correlationContext) > 1 {
+			content = fmt.Sprintf("%s%s",
+				content, fmt.Sprintf(configurations.CORRELATION_CONTEXT, correlationContext[1]))
+		}
+	}
+
 	if alert.Logs == "" || alert.Logs == " " {
 		content += content + ". " + configurations.GPT_FALSE_POSITIVE
 	}
diff --git a/soc-ai/processor/alertProcessor.go b/soc-ai/processor/alertProcessor.go
@@ -5,6 +5,7 @@ import (
 
 	"github.com/utmstack/soc-ai/elastic"
 	"github.com/utmstack/soc-ai/schema"
+	"github.com/utmstack/soc-ai/utils"
 )
 
 func (p *Processor) processAlertsInfo() {
@@ -15,7 +16,18 @@ func (p *Processor) processAlertsInfo() {
 			continue
 		}
 
+		correlation, err := elastic.FindRelatedAlerts(alertInfo)
+		if err != nil {
+			utils.Logger.ErrorF("error finding related alerts: %v", err)
+		}
+
 		details := schema.ConvertFromAlertToAlertDB(alertInfo)
+
+		if correlation != nil && len(correlation.RelatedAlerts) > 0 {
+			correlationContext := elastic.BuildCorrelationContext(correlation)
+			details.Description = details.Description + "\n\n" + correlationContext
+		}
+
 		p.GPTQueue <- cleanAlerts(&details)
 	}
 }

Original file line number	Diff line number	Diff line change
`@@ -59,8 +59,9 @@ var (`
`59`	`59`	"email": {Regexp: `([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})`, FakeValue: "jhondoe@gmail.com"},
`60`	`60`	//"ipv4": `(?:(?:25[0-5]\|2[0-4][0-9]\|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]\|2[0-4][0-9]\|[01]?[0-9][0-9]?)`,
`61`	`61`	`}`
`62`		- GPT_INSTRUCTION = "You are an expert security engineer. Perform a deep analysis of an alert created by a SIEM and the logs related to it. Determine if the alert could be an actual potential threat or not and explain why. Provide a description that shows a deep understanding of the alert based on a deep analysis of its logs and estimate the risk to the systems affected. Classify the alert in the following manner: if the alert information is sufficient to determine that the security, availability, confidentiality, or integrity of the systems has being compromised, then classify it as \"possible incident\". If the alert does not pose a security risk to the organization or has no security relevance, classify it as \"possible false positive\". If the alert does not pose an imminent risk to the systems, requires no urgent action from an administrator, or requires not urgent review by an administrator, it should be classified as a \"standard alert\". You will also provide context-specific instructions for remediation, mitigation, or further investigation, related to the alert and logs analyzed. Your answer should be provided using the following JSON format and the total number of characters in your answer must not exceed 1500 words. Your entire answer must be inside this json format. {\"activity_id\":\"<activity_id>\",\"classification\":\"<classification>\",\"reasoning\":[\"<deep_reasoning>\"],\"nextSteps\":[{\"step\":1,\"action\":\"<action_1>\",\"details\":\"<action_1_details>\"},{\"step\":2,\"action\":\"<action_2>\",\"details\":\"<action_2_details>\"},{\"step\":3,\"action\":\"<action_3>\"]}Ensure that your entire answer adheres to the provided JSON format. The response should be valid JSON syntax and schema."
`63`		- GPT_FALSE_POSITIVE = "This alert is categorized as a potential false positive due to two key factors. Firstly, it originates from an automated system, which may occasionally produce alerts without direct human validation. Additionally, the absence of any correlated logs further raises suspicion, as a genuine incident typically leaves a trail of relevant log entries. Hence, the combination of its system-generated nature and the lack of associated logs suggests a likelihood of being a false positive rather than a genuine security incident."
	`62`	+ GPT_INSTRUCTION = "You are an expert security engineer. Perform a deep analysis of an alert created by a SIEM and the logs related to it. Determine if the alert could be an actual potential threat or not and explain why. Provide a description that shows a deep understanding of the alert based on a deep analysis of its logs and estimate the risk to the systems affected. Classify the alert in the following manner: if the alert information is sufficient to determine that the security, availability, confidentiality, or integrity of the systems has being compromised, then classify it as \"possible incident\". If the alert does not pose a security risk to the organization or has no security relevance, classify it as \"possible false positive\". If the alert does not pose an imminent risk to the systems, requires no urgent action from an administrator, or requires not urgent review by an administrator, it should be classified as a \"standard alert\". You will also provide context-specific instructions for remediation, mitigation, or further investigation, related to the alert and logs analyzed. Your answer should be provided using the following JSON format and the total number of characters in your answer must not exceed 1500 words. Your entire answer must be inside this json format. {\"activity_id\":\"<activity_id>\",\"classification\":\"<classification>\",\"reasoning\":[\"<deep_reasoning>\"],\"nextSteps\":[{\"step\":1,\"action\":\"<action_1>\",\"details\":\"<action_1_details>\"},{\"step\":2,\"action\":\"<action_2>\",\"details\":\"<action_2_details>\"},{\"step\":3,\"action\":\"<action_3>\"]}Ensure that your entire answer adheres to the provided JSON format. The response should be valid JSON syntax and schema."
	`63`	+ GPT_FALSE_POSITIVE = "This alert is categorized as a potential false positive due to two key factors. Firstly, it originates from an automated system, which may occasionally produce alerts without direct human validation. Additionally, the absence of any correlated logs further raises suspicion, as a genuine incident typically leaves a trail of relevant log entries. Hence, the combination of its system-generated nature and the lack of associated logs suggests a likelihood of being a false positive rather than a genuine security incident."
	`64`	`+ CORRELATION_CONTEXT = "\n\nAlert Context: The current alert has historical correlation with previous alerts:\n%s"`
`64`	`65`	`)`
`65`	`66`
`66`	`67`	`func GetInternalKey() string {`
Original file line number	Diff line number	Diff line change
`@@ -5,6 +5,7 @@ import (`
`5`	`5`
`6`	`6`	`"github.com/utmstack/soc-ai/elastic"`
`7`	`7`	`"github.com/utmstack/soc-ai/schema"`
	`8`	`+ "github.com/utmstack/soc-ai/utils"`
`8`	`9`	`)`
`9`	`10`
`10`	`11`	`func (p *Processor) processAlertsInfo() {`
`@@ -15,7 +16,18 @@ func (p *Processor) processAlertsInfo() {`
`15`	`16`	`continue`
`16`	`17`	`}`
`17`	`18`
	`19`	`+ correlation, err := elastic.FindRelatedAlerts(alertInfo)`
	`20`	`+ if err != nil {`
	`21`	`+ utils.Logger.ErrorF("error finding related alerts: %v", err)`
	`22`	`+ }`
	`23`	`+`
`18`	`24`	`details := schema.ConvertFromAlertToAlertDB(alertInfo)`
	`25`	`+`
	`26`	`+ if correlation != nil && len(correlation.RelatedAlerts) > 0 {`
	`27`	`+ correlationContext := elastic.BuildCorrelationContext(correlation)`
	`28`	`+ details.Description = details.Description + "\n\n" + correlationContext`
	`29`	`+ }`
	`30`	`+`
`19`	`31`	`p.GPTQueue <- cleanAlerts(&details)`
`20`	`32`	`}`
`21`	`33`	`}`