diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6aa1821 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +input_data/* +output_data/* +pickles/* +__pycache__/* +.idea/* \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/AfterImage.py b/AfterImage.py index 4f359f7..6c4388a 100644 --- a/AfterImage.py +++ b/AfterImage.py @@ -3,7 +3,7 @@ class incStat: - def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False): # timestamp is creation time + def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False, tcpFlags=False): # timestamp is creation time; tcpFlags is accepted here but only used per insert() call self.ID = ID self.CF1 = 0 # linear sum self.CF2 = 0 # sum of squares @@ -15,8 +15,28 @@ def __init__(self, Lambda, ID, init_time=0, isTypeDiff=False): # timestamp is c self.cur_var = np.nan self.cur_std = np.nan self.covs = [] # a list of incStat_covs (references) that relate to this incStat + self.tcpPkts = 0 + self.flag_counts = { + "FIN": 0, + "SYN": 0, + "RST": 0, + "PSH": 0, + "ACK": 0, + "URG": 0, + "ECE": 0, + "CWR": 0 + } + + def insert(self, v, t=0, tcpFlags=False): # v is a scalar, t is v's arrival timestamp + if tcpFlags: + self.tcpPkts += 1 + flag_int = int(tcpFlags, 16) # Convert hex string to integer + flags = ["FIN", "SYN", "RST", "PSH", "ACK", "URG", "ECE", "CWR"] + for i, flag in enumerate(flags): + if flag_int & (1 << i): # Check if the flag is set + self.flag_counts[flag] += 1 + return True # flag-only update: the rolling statistics below are skipped - def insert(self, v, t=0): # v is a scalar, t is v's arrival the timestamp if self.isTypeDiff: dif = t - self.lastTimestamp if dif > 0: @@ -98,9 +118,13 @@ def magnitude(self, other_incStats): # the magnitude of a set of incStats return math.sqrt(A) #calculates and pulls all stats on this stream - def allstats_1D(self): + def allstats_1D(self, tcpFlags=False): self.cur_mean = self.CF1 / self.w self.cur_var = abs(self.CF2 / self.w - math.pow(self.cur_mean, 2)) + # Return the fraction of observed TCP packets that carried each flag + if tcpFlags: + flags = [flag / self.tcpPkts for flag in list(self.flag_counts.values())] + return flags return [self.w, self.cur_mean, self.cur_var] #calculates and pulls all stats on this stream, and stats shared with the indicated stream @@ -264,7 +288,6 @@ def get_lambda(self,Lambda): def register(self,ID,Lambda=1,init_time=0,isTypeDiff=False): #Default Lambda? Lambda = self.get_lambda(Lambda) - #Retrieve incStat key = ID+"_"+str(Lambda) incS = self.HT.get(key) @@ -298,9 +321,9 @@ def register_cov(self,ID1,ID2,Lambda=1,init_time=0,isTypeDiff=False): return inc_cov # updates/registers stream - def update(self,ID,t,v,Lambda=1,isTypeDiff=False): + def update(self,ID,t,v,Lambda=1,isTypeDiff=False,tcpFlags=False): incS = self.register(ID,Lambda,t,isTypeDiff) - incS.insert(v,t) + incS.insert(v,t,tcpFlags=tcpFlags) return incS # Pulls current stats from the given ID @@ -369,9 +392,9 @@ def get_nD_Stats(self,IDs,Lambda=1): #radius, magnitude (IDs is a list) return [np.sqrt(rad),np.sqrt(mag)]
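# A standalone sketch of the flag decoding used in incStat.insert() above, assuming
# tshark-style hex strings such as "0x0018"; decode_tcp_flags is an illustrative helper,
# not part of the patch:
def decode_tcp_flags(hex_flags):
    flags = ["FIN", "SYN", "RST", "PSH", "ACK", "URG", "ECE", "CWR"]  # bit 0 .. bit 7
    flag_int = int(hex_flags, 16)  # e.g. "0x0018" -> 24
    return [flag for i, flag in enumerate(flags) if flag_int & (1 << i)]

# decode_tcp_flags("0x0018") returns ['PSH', 'ACK']; incrementing flag_counts for each
# returned flag reproduces the per-flag tallies that allstats_1D(tcpFlags=True) later
# divides by tcpPkts.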
# Updates and then pulls current 1D stats from the given ID. Automatically registers previously unknown stream IDs - def update_get_1D_Stats(self, ID,t,v,Lambda=1,isTypeDiff=False): # weight, mean, std - incS = self.update(ID,t,v,Lambda,isTypeDiff) - return incS.allstats_1D() + def update_get_1D_Stats(self, ID,t,v,Lambda=1,isTypeDiff=False, tcpFlags=False): # weight, mean, std + incS = self.update(ID,t,v,Lambda,isTypeDiff, tcpFlags=tcpFlags) + return incS.allstats_1D(tcpFlags) # Updates and then pulls current correlative stats between the given IDs. Automatically registers previously unknown stream IDs, and cov tracking @@ -439,4 +462,3 @@ def cleanOutOldRecords(self,cutoffWeight,curTime): elif W > cutoffWeight: break return n - diff --git a/FeatureExtractor.py b/FeatureExtractor.py index da3eca1..cb27035 100644 --- a/FeatureExtractor.py +++ b/FeatureExtractor.py @@ -17,6 +17,7 @@ import os.path import platform import subprocess +import csv #Extracts Kitsune features from given pcap file one packet at a time using "get_next_vector()" @@ -63,7 +64,6 @@ def __prep__(self): ##If file is TSV (pre-parsed by wireshark script) if type == "tsv": self.parse_type = "tsv" - ##If file is pcap elif type == "pcap" or type == 'pcapng': # Try parsing via tshark dll of wireshark (faster) @@ -106,7 +106,7 @@ def __prep__(self): self.limit = len(self.scapyin) print("Loaded " + str(len(self.scapyin)) + " Packets.") - def get_next_vector(self): + def get_next_vector(self, single=False): if self.curPacketIndx == self.limit: if self.parse_type == 'tsv': self.tsvinf.close() @@ -120,6 +120,10 @@ def get_next_vector(self): framelen = row[1] srcIP = '' dstIP = '' + tcpFlags = '' + tcpFlags = row[19] + payload = '' + #payload = int(row[20])+int(row[21]) if row[4] != '': # IPv4 srcIP = row[4] dstIP = row[5] @@ -128,8 +132,7 @@ srcIP = row[17] dstIP = row[18] IPtype = 1 - srcproto = row[6] + row[ - 8] # UDP or TCP port: the concatenation of the two port strings will will results in an OR "[tcp|udp]" + srcproto = row[6] + row[8] # UDP or TCP port: the concatenation of the two port strings results in an OR "[tcp|udp]" dstproto = row[7] + row[9] # UDP or TCP port srcMAC = row[2] dstMAC = row[3] @@ -147,7 +150,6 @@ elif srcIP + srcproto + dstIP + dstproto == '': # some other protocol srcIP = row[2] # src MAC dstIP = row[3] # dst MAC - elif self.parse_type == "scapy": packet = self.scapyin[self.curPacketIndx] IPtype = np.nan @@ -195,13 +197,14 @@ return [] self.curPacketIndx = self.curPacketIndx + 1 - + if not single: + tcpFlags = False ### Extract Features try: return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto, int(framelen), - float(timestamp)) + float(timestamp), tcpFlags, payload) except Exception as e: print(e) return [] @@ -209,10 +212,36 @@ def pcap2tsv_with_tshark(self): print('Parsing with tshark...') - fields = "-e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e icmp.type -e icmp.code -e arp.opcode -e arp.src.hw_mac -e arp.src.proto_ipv4 -e arp.dst.hw_mac -e arp.dst.proto_ipv4 -e ipv6.src -e ipv6.dst" + fields = "-e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e icmp.type -e icmp.code -e arp.opcode -e arp.src.hw_mac -e arp.src.proto_ipv4 -e arp.dst.hw_mac -e arp.dst.proto_ipv4 -e ipv6.src -e ipv6.dst -e tcp.flags -e tcp.len -e udp.length -e http.response.code" cmd = '"' + self._tshark + '" -r '+ self.path +' -T fields '+ fields +' -E header=y -E occurrence=f > '+self.path+".tsv" subprocess.call(cmd,shell=True) print("tshark parsing complete. File saved as: "+self.path +".tsv") def get_num_features(self): return len(self.nstat.getNetStatHeaders()) + + # Extracts all remaining feature vectors at once, optionally streaming them to a CSV file + def get_all_vectors(self, csv_path=False, single=False): + vectorList = [] + if csv_path: + with open(csv_path, mode='w', newline='') as csv_file: + csv_writer = csv.writer(csv_file) + while True: + if self.curPacketIndx % 100000 == 0: + print(self.curPacketIndx) + vector = self.get_next_vector(single) + if len(vector) == 0 or self.curPacketIndx > self.limit: + self.curPacketIndx = 0 + return csv_path + else: + csv_writer.writerow(vector) + else: + while True: + if self.curPacketIndx % 1000 == 0: + print(self.curPacketIndx) + vector = self.get_next_vector() + if len(vector) == 0 or self.curPacketIndx > self.limit: + self.curPacketIndx = 0 + return vectorList + else: + vectorList.append(vector) + diff --git a/KitNET/KitNET.py b/KitNET/KitNET.py index f839cfa..79ed1e7 100644 --- a/KitNET/KitNET.py +++ b/KitNET/KitNET.py @@ -17,7 +17,7 @@ class KitNET: #feature_map: One may optionally provide a feature map instead of learning one. The map must be a list, # where the i-th entry contains a list of the feature indices to be assigned to the i-th autoencoder in the ensemble. # For example, [[2,5,3],[4,0,1],[6,7]] - def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75, feature_map = None): + def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75, feature_map=None): # Parameters: self.AD_grace_period = AD_grace_period if FM_grace_period is None: @@ -50,7 +50,8 @@ def __init__(self,n,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period #Note: KitNET automatically performs 0-1 normalization on all attributes.
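# As a rough sketch of what the running 0-1 (min-max) normalization mentioned in the note
# above looks like, assuming per-feature running extrema are maintained across packets
# (norm_min/norm_max are illustrative names, not necessarily KitNET's internal fields):
import numpy as np
norm_min = np.full(100, np.inf)    # one slot per feature; n=100 in this codebase
norm_max = np.full(100, -np.inf)
def normalize_01(x):
    global norm_min, norm_max
    norm_min = np.minimum(norm_min, x)   # update running extrema, then squash into [0,1]
    norm_max = np.maximum(norm_max, x)
    return (x - norm_min) / (norm_max - norm_min + 1e-13)  # epsilon guards the first packet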
def process(self,x): if self.n_trained > self.FM_grace_period + self.AD_grace_period: #If both the FM and AD are in execute-mode - return self.execute(x) + result = self.execute(x) + return result else: self.train(x) return 0.0 @@ -104,6 +105,16 @@ def __createAD__(self): params = AE.dA_params(len(self.v), n_hidden=0, lr=self.lr, corruption_level=0, gracePeriod=0, hiddenRatio=self.hr) self.outputLayer = AE.dA(params) + # Feeds each instance through process() and returns the scores as a numpy array + def process_batch(self, data): + resultList = [] + count = 0 + for instance in data: + if count % 1000 == 0: + print("processing packet ", count, " / ", len(data)) + resultList.append(self.process(instance)) + count += 1 + return np.array(resultList) + # Copyright (c) 2017 Yisroel Mirsky # # MIT License diff --git a/KitNET/__pycache__/KitNET.cpython-39.pyc b/KitNET/__pycache__/KitNET.cpython-39.pyc new file mode 100644 index 0000000..545f645 Binary files /dev/null and b/KitNET/__pycache__/KitNET.cpython-39.pyc differ diff --git a/KitNET/__pycache__/__init__.cpython-39.pyc b/KitNET/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..1b92442 Binary files /dev/null and b/KitNET/__pycache__/__init__.cpython-39.pyc differ diff --git a/KitNET/__pycache__/corClust.cpython-39.pyc b/KitNET/__pycache__/corClust.cpython-39.pyc new file mode 100644 index 0000000..5ac12dc Binary files /dev/null and b/KitNET/__pycache__/corClust.cpython-39.pyc differ diff --git a/KitNET/__pycache__/dA.cpython-39.pyc b/KitNET/__pycache__/dA.cpython-39.pyc new file mode 100644 index 0000000..264747c Binary files /dev/null and b/KitNET/__pycache__/dA.cpython-39.pyc differ diff --git a/KitNET/__pycache__/utils.cpython-39.pyc b/KitNET/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000..4dfa4c7 Binary files /dev/null and b/KitNET/__pycache__/utils.cpython-39.pyc differ diff --git a/KitPlugin.py b/KitPlugin.py new file mode 100644 index 0000000..1d7bb83 --- /dev/null +++ b/KitPlugin.py @@ -0,0 +1,1544 @@ +import csv +import math +import os + +import openpyxl +import pandas as pd +from matplotlib import pyplot as plt +from openpyxl.chart import BarChart, Reference +from optuna_dashboard import run_server + +from Kitsune import Kitsune +from KitNET.KitNET import KitNET +import netStat as ns +import shap +import numpy as np +import pickle +from openpyxl import Workbook +from openpyxl.styles import PatternFill +from datetime import datetime, timedelta +import sklearn +import optuna +from scipy.stats import norm +import random +from scapy.all import PcapReader, PcapWriter, wrpcap, rdpcap, IP, TCP, UDP + +# Class that provides a callable interface for Kitsune components. +# Note that this approach nullifies the "incremental" aspect of Kitsune and significantly slows it down. +class KitPlugin: + # Function used by SHAP as callback to test instances of features + def kitsune_model(self, input_data): + prediction = self.K.feed_batch(input_data) + return prediction + + def kitnet_model(self, input_data): + prediction = self.KitTest.process_batch(input_data) + return prediction +
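# The two callbacks above exist so SHAP can treat Kitsune/KitNET as a black box that maps a
# batch of feature rows to one anomaly score per row. A minimal sketch of the hookup,
# mirroring how shap_values_builder() below uses them (shap and np are imported at the top
# of KitPlugin.py; train_features/test_features are placeholder arrays, not names from
# this patch):
def shap_for_plugin(plugin, train_features, test_features):
    explainer = shap.Explainer(plugin.kitnet_model, np.array(train_features))  # background data
    return explainer.shap_values(np.array(test_features))                      # one row per test packet

 + # Builds a Kitsune instance. Does not train KitNET yet.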
+ def __init__(self, input_path=None, packet_limit=None, num_autenc=None, FMgrace=None, ADgrace=None, learning_rate=0.1, hidden_ratio=0.75): + # This code will be removed when batch running Kitsune has been finalized + if input_path != None and num_autenc != None: + self.features_list = None + self.explainer = None + self.shap_values = None + self.K = Kitsune(input_path, packet_limit, num_autenc, FMgrace, ADgrace, learning_rate, hidden_ratio) + self.metadata = { + "filename" : input_path, + "packet_limit" : packet_limit, + "num_autenc" : num_autenc, + "FMgrace": FMgrace, + "ADgrace" : ADgrace, + "timestamp" : datetime.now().strftime("%d/%m/%Y %H:%M:%S") + } + self.testFeatures = None + maxHost = 100000000000 + maxSess = 100000000000 + self.nstat = ns.netStat(np.nan, maxHost, maxSess) + + # Calls Kitsune's get_feature_list function to build the list of features + def feature_builder(self, csv=False, single=False): + print("Building features") + # Dummy-running Kitsune to get a list of features + self.features_list = self.K.get_feature_list(csv, single) + return self.features_list + + # Loads Kitsune's feature list from a pickle file + def feature_loader(self, newpickle=None): + print("Loading features from file") + path = 'pickles/featureList.pkl' + if newpickle != None: + path = newpickle + with open(path, 'rb') as f: + features_list = pickle.load(f) + self.features_list = features_list + + # Writes Kitsune's feature list to a pickle file + def feature_pickle(self, newpickle=None): + print("Writing features to file") + path = 'pickles/featureList.pkl' + if newpickle != None: + path = newpickle + with open(path, 'wb') as f: + pickle.dump(self.features_list, f) + + # Trains KitNET, using the specified index range of this class' feature list + def kit_trainer(self, min_index, max_index): + print("Training") + self.K.feed_batch(self.features_list[min_index:max_index]) + print("Training finished") + + # Trains KitNET, using a supplied feature list + def kit_trainer_supplied_features(self, features_list): + print("Training") + self.K.feed_batch(features_list) + print("Training finished") + + # Runs KitNET, using specified index range of this class' feature list + def kit_runner(self, min_index, max_index, normalize=False): + print("Running") + print(len(self.features_list[min_index:max_index])) + return self.K.feed_batch(self.features_list[min_index:max_index]) + + # Calculates KitNET's SHAP-values for the specified indexes + def shap_values_builder(self, min_train, max_train, min_test, max_test): + self.metadata['min_train'] = min_train + self.metadata['max_train'] = max_train + self.metadata['min_test'] = min_test + self.metadata['max_test'] = max_test + print("Building SHAP explainer") + self.explainer = shap.Explainer(self.kitsune_model, np.array(self.features_list[min_train:max_train])) + print("Calculating SHAP values") + if self.testFeatures != None: + self.shap_values = self.explainer.shap_values(np.array(self.testFeatures[min_test:max_test])) + else: + self.shap_values = self.explainer.shap_values(np.array(self.features_list[min_test:max_test])) + return self.shap_values + + # Writes the SHAP-values to a pickle-file + def shap_values_pickle(self, newpickle=None): + path = 'pickles/shap_values.pkl' + if newpickle != None: + path = newpickle + with open(path, 'wb') as f: + pickle.dump(self.shap_values, f) + + # Gets the SHAP-values from a pickle-file + def shap_values_loader(self, newpickle=None): + path = 'pickles/shap_values.pkl' + if newpickle != None: + path = newpickle + with 
open(path, 'rb') as f: + self.shap_values = pickle.load(f) + return self.shap_values + + # Calculates summary statistics of SHAP-values + def shap_stats_summary_builder(self, min_index, max_index, plot_type="dot"): + return shap.summary_plot(self.shap_values, np.array(self.features_list[min_index:max_index]), plot_type=plot_type) + + # Creates an Excel-file containing summary statistics for each feature + def shap_stats_excel_export(self, path=None): + self.workbook = openpyxl.load_workbook('input_data/template_statistics_file.xlsx') + self.create_sheet("malicious_shap") + excel_file = "summary_statistics_test.xlsx" + if path != None: + excel_file = path + self.workbook.save(excel_file) + print('done') + + # Calculates the three best and worst values for all statistics + def get_high_low_indices(self): + shap_transposed = self.shap_values.T + # List of statistics functions + stat_functions = { + 'mean': np.mean, + 'median': np.median, + 'std_dev': np.std, + 'variance': np.var, + 'minimum': np.min, + 'maximum': np.max, + 'total_sum': np.sum + } + + # Dictionary to store results + result_dict = {} + + # Loop over statistics + for stat_name, stat_func in stat_functions.items(): + # Calculate the statistic for each list + stat_values = stat_func(shap_transposed, axis=1) + + # Calculate the indices of the highest and lowest values + sorted_indices = np.argsort(stat_values) + highest_indices = sorted_indices[-3:] + lowest_indices = sorted_indices[:3] + # Store the indices in the result dictionary + result_dict[stat_name] = { + 'highest_indices': highest_indices, + 'lowest_indices': lowest_indices + } + return result_dict + + def create_histogram(self, day, featuremean, featuremedian, sheet_title): + # Extract keys and values from dictionaries + keys = list(featuremean.keys()) + values_mean = list(featuremean.values()) + values_median = list(featuremedian.values()) + + # Set up the figure and axis + fig, ax = plt.subplots() + + # Set bar width + bar_width = 0.35 + + # Set the bar positions + index = np.arange(len(keys)) + + # Plot bars for featuremean + bar1 = ax.bar(index, values_mean, bar_width, color='blue', label='Feature Mean') + + # Plot bars for featuremedian + bar2 = ax.bar(index + bar_width, values_median, bar_width, color='orange', label='Feature Median') + + # Set labels and title + ax.set_xlabel('Feature') + plt.xticks(rotation=45) + plt.tight_layout() + ax.set_ylabel('SHAP-value') + ax.set_title(f"{day}: {sheet_title}") + ax.set_xticks(index + bar_width / 2) + ax.set_xticklabels(keys) + + # Add legend + ax.legend() + plt.savefig(f'output_data/attack_types/{day}_{sheet_title}') + # Show the plot + plt.show() + + def create_histogram_to_sheet_lambda(self, day, featuremean, featuremedian, sheet_title, worksheet, col): + # Extract keys and values from dictionaries + keys = list(featuremean.keys()) + values_mean = list(featuremean.values()) + values_median = list(featuremedian.values()) + + # Create a bar chart + chart = BarChart() + chart.title = f"{day}: {sheet_title}" + chart.x_axis.title = 'Feature' + chart.y_axis.title = 'SHAP-value' + + data_ref = Reference(worksheet, min_col=2, min_row=103, max_col=3, max_row=103 + len(keys)) + categories_ref = Reference(worksheet, min_col=1, min_row=103, max_row=103 + len(keys)) + chart.add_data(data_ref, titles_from_data=True) + chart.set_categories(categories_ref) + + # # Set custom bar titles for the first and second bars (mean and median) + # chart.series[0].title = "Mean" + # chart.series[1].title = "Median" + + # Calculate the row number for 
placing the chart + chart_row = worksheet.max_row + 2 # Placing chart after the data with a buffer of one row + + # Add the chart to the worksheet + worksheet.add_chart(chart, f"{col}{chart_row}") + + return worksheet + + def create_histogram_to_sheet_feature(self, day, featuremean, featuremedian, sheet_title, worksheet, col): + # Extract keys and values from dictionaries + keys = list(featuremean.keys()) + values_mean = list(featuremean.values()) + values_median = list(featuremedian.values()) + + # Create a bar chart + chart = BarChart() + chart.title = f"{day}: {sheet_title}" + chart.x_axis.title = 'Feature' + chart.y_axis.title = 'SHAP-value' + + data_ref = Reference(worksheet, min_col=2, min_row=110, max_col=3, max_row=110+len(keys) + 1) + categories_ref = Reference(worksheet, min_col=1, min_row=110, max_row=110+len(keys) + 1) + chart.add_data(data_ref, titles_from_data=True) + chart.set_categories(categories_ref) + + # Calculate the row number for placing the chart + chart_row = worksheet.max_row + 2 # Placing chart after the data with a buffer of one row + + # Add the chart to the worksheet + worksheet.add_chart(chart, f"{col}{chart_row}") + + return worksheet + + # Creates an Excel sheet with relevant statistics + def create_sheet(self, day, sheet_title=None): + if sheet_title is None: + sheet_title = str(day) # some call sites pass a single label; default the sheet title to it + sheet = self.workbook.copy_worksheet(self.workbook.active) + sheet.title = sheet_title + headers = ['Mean', 'Median', 'Standard Deviation', 'Variance', 'Minimum', 'Maximum', 'Sum', 'Metadata'] + header_row = headers + lambdameans = { + '5':[], + '3':[], + '1':[], + '0.1':[], + '0.01':[] + } + lambdamean = { + '5': None, + '3': None, + '1': None, + '0.1': None, + '0.01': None + } + lambdamedians = { + '5':[], + '3':[], + '1':[], + '0.1':[], + '0.01':[] + } + lambdamedian = { + '5': None, + '3': None, + '1': None, + '0.1': None, + '0.01': None + } + featuremeans = { + 'weight':[], + 'mean':[], + 'standard deviation':[], + 'radius':[], + 'magnitude':[], + 'covariance':[], + 'pearson correlation coefficient':[] + } + featuremean = { + 'weight': None, + 'mean': None, + 'standard deviation': None, + 'radius': None, + 'magnitude': None, + 'covariance': None, + 'pearson correlation coefficient': None + } + featuremedians = { + 'weight':[], + 'mean':[], + 'standard deviation':[], + 'radius':[], + 'magnitude':[], + 'covariance':[], + 'pearson correlation coefficient':[] + } + featuremedian = { + 'weight': None, + 'mean': None, + 'standard deviation': None, + 'radius': None, + 'magnitude': None, + 'covariance': None, + 'pearson correlation coefficient': None + } + for col, value in enumerate(header_row): + cell = sheet.cell(row=1, column=6 + col) + cell.value = value + for idx, num_list in enumerate(self.shap_values.T): + mean = np.mean(num_list) + median = np.median(num_list) + std_dev = np.std(num_list) + variance = np.var(num_list) + minimum = np.min(num_list) + maximum = np.max(num_list) + total_sum = np.sum(num_list) + if idx+1 <= 20: + lambdameans['5'].append(mean) + lambdamedians['5'].append(median) + if idx+1 > 20 and idx+1 <= 40: + lambdameans['3'].append(mean) + lambdamedians['3'].append(median) + if idx+1 > 40 and idx+1 <= 60: + lambdameans['1'].append(mean) + lambdamedians['1'].append(median) + if idx+1 > 60 and idx+1 <= 80: + lambdameans['0.1'].append(mean) + lambdamedians['0.1'].append(median) + if idx+1 > 80 and idx+1 <= 100: + lambdameans['0.01'].append(mean) + lambdamedians['0.01'].append(median) + + if idx+1 in [1, 4, 11, 14, 21, 24, 31, 34, 41, 44, 51, 54, 61, 64, 71, 74, 81, 84, 91, 94]: + featuremeans['weight'].append(mean) + featuremedians['weight'].append(median) + if idx+1 in [2, 5, 12, 15, 22, 25, 32, 35, 42, 45, 52, 55, 62, 65, 72, 75, 82, 85, 92, 95]: + featuremeans['mean'].append(mean) + featuremedians['mean'].append(median) + if idx + 1 in [3, 6, 13, 16, 23, 26, 33, 36, 43, 46, 53, 56, 63, 66, 73, 76, 83, 86, 93, 96]: + featuremeans['standard deviation'].append(mean) + featuremedians['standard deviation'].append(median) + if idx + 1 in [7, 17, 27, 37, 47, 57, 67, 77, 87, 97]: + featuremeans['radius'].append(mean) + featuremedians['radius'].append(median) + if idx + 1 in [8, 18, 28, 38, 48, 58, 68, 78, 88, 98]: + featuremeans['magnitude'].append(mean) + featuremedians['magnitude'].append(median) + if idx + 1 in [9, 19, 29, 39, 49, 59, 69, 79, 89, 99]: + featuremeans['covariance'].append(mean) + featuremedians['covariance'].append(median) + if idx + 1 in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: + featuremeans['pearson correlation coefficient'].append(mean) + featuremedians['pearson correlation coefficient'].append(median) + row_data = [mean, median, std_dev, variance, minimum, maximum, total_sum] + + for col, value in enumerate(row_data): + cell = sheet.cell(row=idx + 2, column=6 + col) + cell.value = value + + row = idx + 2 + row += 1 + cell = sheet.cell(row=row, column=1) + cell.value = "Grouped by lambda" + cell = sheet.cell(row=row, column=2) + cell.value = "mean" + cell = sheet.cell(row=row, column=3) + cell.value = "median" + row += 1 + + for key in lambdameans: + cell = sheet.cell(row=row, column = 1) + cell.value = key + cell = sheet.cell(row=row, column = 2) + cell.value = np.mean(np.array(lambdameans[key])) + lambdamean[key] = np.mean(np.array(lambdameans[key])) + cell = sheet.cell(row=row, column=3) + cell.value = np.median(np.array(lambdamedians[key])) + print(f"key: {key}, value: {np.mean(np.array(lambdamedians[key]))}") + lambdamedian[key] = np.median(np.array(lambdamedians[key])) + row += 1 + row += 1 + + cell = sheet.cell(row=row, column=1) + cell.value = "Grouped by feature" + cell = sheet.cell(row=row, column=2) + cell.value = "mean" + cell = sheet.cell(row=row, column=3) + cell.value = "median" + row += 1 + for key in featuremeans: + cell = sheet.cell(row=row, column=1) + cell.value = key + cell = sheet.cell(row=row, column=2) + cell.value = np.mean(np.array(featuremeans[key])) + featuremean[key] = np.mean(np.array(featuremeans[key])) + cell = sheet.cell(row=row, column=3) + cell.value = np.median(np.array(featuremedians[key])) + featuremedian[key] = np.median(np.array(featuremedians[key])) + row += 1 + self.create_histogram(day, featuremean, featuremedian, sheet_title+" grouped by feature name") + self.create_histogram_to_sheet_feature(day, featuremean, featuremedian, sheet_title + " grouped by feature name", sheet, "A") + self.create_histogram(day, lambdamean, lambdamedian, sheet_title + " grouped by lambda value") + self.create_histogram_to_sheet_lambda(day, lambdamean, lambdamedian, sheet_title + " grouped by lambda value", sheet, "F") + row += 1 + + color_indices = self.get_high_low_indices() + stat_columns = { + 'mean': "F", + 'median': "G", + 'std_dev': "H", + 'variance': "I", + 'minimum': "J", + 'maximum': "K", + 'total_sum': "L" + } + for stat in color_indices: + for index in color_indices[stat]["highest_indices"]: + cell_index = stat_columns[stat] + str(index + 2) + if stat == "std_dev" or stat == "variance": + # Make largest three standard deviation and variance values blue + sheet[cell_index].fill = PatternFill(start_color="ADD8E6", 
end_color="ADD8E6", fill_type="solid") + elif stat == "minimum": + # Make largest three cells minimum red + sheet[cell_index].fill = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid") + else: + # In all other cases, make largest three cells green + sheet[cell_index].fill = PatternFill(start_color="00FF00", end_color="00FF00", fill_type="solid") + for index in color_indices[stat]["lowest_indices"]: + cell_index = stat_columns[stat] + str(index + 2) + if stat == "minimum": + # Make largest three cells minimum red + sheet[cell_index].fill = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid") + else: + # In all other cases, make smallest three cells red + sheet[cell_index].fill = PatternFill(start_color="FF0000", end_color="FF0000", fill_type="solid") + # Fill in metadata + start_row = 2 + start_column_keys = 'M' + start_column_values = 'N' + + # Loop over the dictionary and write capitalized keys and values to cells + if hasattr(self, "metadata"): + for idx, (key, value) in enumerate(self.metadata.items()): + capitalized_key = key[0].upper() + key[1:] + key_cell = f"{start_column_keys}{start_row + idx}" + value_cell = f"{start_column_values}{start_row + idx}" + sheet[key_cell] = capitalized_key + sheet[value_cell] = value + return sheet + + # Runs a series of Kitsune models and calculates statistics for each run. + def run_series_stats(self, inputs): + self.workbook = openpyxl.load_workbook('input_data/template_statistics_file.xlsx') + # Loop over the different Kitsune configs we are going to make + for session in inputs: + self.features_list = None + self.explainer = None + self.shap_values = None + self.K = Kitsune(inputs[session]["input_path"], inputs[session]["packet_limit"], inputs[session]["maxAE"], inputs[session]["FMgrace"], inputs[session]["ADgrace"]) + self.metadata = { + "filename": inputs[session]["input_path"], + "packet_limit": inputs[session]["packet_limit"], + "maxAE": inputs[session]["maxAE"], + "FMgrace": inputs[session]["FMgrace"], + "ADgrace": inputs[session]["ADgrace"], + "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") + } + self.feature_builder() + self.kit_trainer(inputs[session]["training_min"], inputs[session]["training_max"]) + if inputs[session]["input_path"] != inputs[session]["input_path_test"]: + self.testKit = Kitsune(inputs[session]["input_path_test"], inputs[session]["packet_limit"], inputs[session]["maxAE"], inputs[session]["FMgrace"], inputs[session]["ADgrace"]) + self.testFeatures = self.testKit.get_feature_list() + self.shap_values_builder(inputs[session]["training_min"], inputs[session]["training_max"], inputs[session]["testing_min"], inputs[session]["testing_max"]) + self.create_sheet(session) + excel_file = "summary_statistics_" + datetime.now().strftime('%d-%m-%Y_%H-%M') + ".xlsx" + self.workbook.save(excel_file) + + # Runs a hyperparameter optimization on the supplied dataset, constrained by number of runs and packet limit + def hyper_opt(self, input_path, runs, packet_limit, load=False): + if load: + self.feature_loader() + else: + self.K = Kitsune(input_path, packet_limit * 1.3, 10, 5000, 50000, 0.1, 0.75) + self.feature_builder() + self.feature_pickle() + + def objective(trial): + numAE = trial.suggest_int('numAE', 1, 10) + learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5) + hidden_ratio = trial.suggest_float('hidden_ratio', 0.5, 0.8) + + self.K = Kitsune(input_path, packet_limit*1.3, numAE, int(0.1*packet_limit), int(0.6*packet_limit), learning_rate, hidden_ratio) + # Load the 
feature list beforehand to save time + self.feature_loader() + print('training on '+str(int(0.7*packet_limit))+' packets') + self.kit_trainer(0, int(0.7*packet_limit)) + + y_test = np.zeros((int(0.2*packet_limit), 1)) + y_pred = self.kit_runner(int(0.7*packet_limit), int(0.9*packet_limit)) + + # Do small test run with benign sample to find normalization + print("Calculating normalization sample") + #benignSample = np.log(self.kit_runner(int(0.5*packet_limit), int(0.6*packet_limit))) + #logProbs = norm.logsf(np.log(y_pred), np.mean(benignSample), np.std(benignSample)) + print('predictions') + print(y_pred) + #print('normalization sample') + #print(benignSample) + #print('logProbs') + #print(logProbs) + error = sklearn.metrics.mean_squared_error(y_test, y_pred) + + print('error') + print(error) + return error + + study = optuna.create_study() + study.optimize(objective, n_trials=runs) + + # Create a new workbook and select the active worksheet + wb = Workbook() + ws = wb.active + + # Write header row + header = ["Trial Number", "numAE", "learning_rate", "hidden_ratio"] + ws.append(header) + + # Write trial information + best_value = float("inf") + best_row_idx = None # Track the index of the best row + for idx, trial in enumerate(study.trials, start=2): # Start from row 2 to leave room for the header + trial_params = trial.params + trial_row = [trial.number, trial_params["numAE"], trial_params["learning_rate"], trial_params["hidden_ratio"], trial.value] + ws.append(trial_row) + + if trial.value < best_value: + best_value = trial.value + best_row_idx = idx + + # Set fill color for the best value row + green_fill = PatternFill(start_color="00FF00", end_color="00FF00", fill_type="solid") + if best_row_idx is not None: + for cell in ws[best_row_idx]: + cell.fill = green_fill + + # Save the workbook to a file + excel_file_path = "output_data/hyperparameter_optimization_results_" + datetime.now().strftime('%d-%m-%Y_%H-%M') + ".xlsx" + wb.save(excel_file_path) + + print("Results exported to", excel_file_path) + return study.best_trial + + # Calculates an EER-score for a list of RMSEs + def calc_eer(self, RMSEs, labels): + fpr, tpr, threshold = sklearn.metrics.roc_curve(labels, RMSEs, pos_label=1) + fnr = 1-tpr + #eer_threshold = threshold[np.nanargmin(np.absolute((fnr-fpr)))] + EER = fpr[np.nanargmin(np.absolute((fnr-fpr)))] + return EER + + # Calculates an AUC-score for a list of RMSEs and a list of expected values + def calc_auc(self, RMSEs, labels): + auc_score = sklearn.metrics.roc_auc_score(labels, RMSEs) + return auc_score + + # Calculates an EER-score for a list of RMSEs and a list of expected values + def calc_auc_eer(self, RMSEs, labels): + return (self.calc_auc(RMSEs, labels), self.calc_eer(RMSEs, labels)) + + # DEPRECATED Takes a random sample from a .pcap file, limited by the supplied sample size + #def random_sample_pcap(self, input_path, output_path, sample_size): + # # Initialize the sampled_packets list and a counter + # sampled_packets = [] + # counter = 0 + + # Open the PCAP file for reading + # with PcapReader(input_path) as pcap_reader: + # for packet in pcap_reader: + # counter += 1 + # if counter % 10000 == 0: + # print(counter) + # if len(sampled_packets) < sample_size: + # sampled_packets.append(packet) + # else: + # # Randomly decide whether to add the new packet or not + # probability = sample_size / counter + # if random.random() < probability: + # random_index = random.randint(0, sample_size - 1) + # sampled_packets[random_index] = packet + + # Write the sampled packets 
to a new PCAP file while preserving the order + # wrpcap(output_path, sampled_packets) + + # print(f"Sampled {sample_size} packets and saved to {output_path}") + + # DEPRECATED Takes the first n percentage out of every 1000 packets, does the same for the next 1000 packets + #def interval_sample_pcap(self, input_path, output_path, percentage): + # # Initialize the sampled_packets list and a counter + # sampled_packets = [] + # counter = 0 + + # # Open the PCAP file for reading + # with PcapReader(input_path) as pcap_reader: + # for packet in pcap_reader: + # counter += 1 + # if counter % 10000 == 0: + # print(counter) + + # if counter % 1000 <= (1000*(percentage/100)): # Sample the first 100 out of every 1000 packets + # sampled_packets.append(packet) + + # Write the sampled packets to a new PCAP file while preserving the order + # wrpcap(output_path, sampled_packets) + + # print(f"Sampled the first 100 packets out of every 1000 and saved to {output_path}") + + # DEPRECATED Extracts the conversations from a pcap-file + #def extract_conversations(self, input_path): + # print('Reading pcap-file') + # conversations = [] + # current_conversation = [] + # counter = 0 + + # with PcapReader(input_path) as pcap_reader: + # for packet in pcap_reader: + # counter += 1 + # if counter % 10000 == 0: + # print(f"{counter} packets processed") + + # if IP in packet: + # if TCP in packet: + # conversation_key = (packet[IP].src, packet[IP].dst, packet[TCP].sport, packet[TCP].dport) + # elif UDP in packet: + # conversation_key = (packet[IP].src, packet[IP].dst, packet[UDP].sport, packet[UDP].dport) + # else: + # continue + + # if conversation_key not in current_conversation: + # current_conversation.append(conversation_key) + # conversations.append([]) + + # conversations[current_conversation.index(conversation_key)].append(packet) + + # self.conversations_list = conversations + # return conversations + + # DEPRECATED Writes a list of conversations to a pcap-file + #def create_pcap_from_conversations(self, conversations, output_path): + # print('Writing packets to pcap-file') + # packets_to_write = [] + + # for conversation in conversations: + # packets_to_write.extend(conversation) + + # with PcapWriter(output_path) as pcap_writer: + # pcap_writer.write(packets_to_write) + + # Sample a percentage of conversations (not of packets) + # def sample_percentage_conversations(self, percentage, input_path, output_path=None): + # conversation_list = self.extract_conversations(input_path) + # print(f'Sampling {percentage} percent of conversations') + # sampled_conversations = random.sample(conversation_list, int(0.01 * percentage * len(conversation_list))) + + # if output_path is not None: + # self.create_pcap_from_conversations(sampled_conversations, output_path) + + # self.conversations_list = sampled_conversations + # return sampled_conversations + + # DEPRECATED Trains Kitsune on a list of conversations + # def train_Kitsune_on_conversations(self, conversation_list): + # self.K = Kitsune("input_data/empty.pcap", np.Inf, 6, math.floor(len(conversation_list)*0.1), math.floor(len(conversation_list)*0.9)) + # for conversation in conversation_list: + # self.K.feed_batch(conversation) + + # DEPRECATED Runs Kitsune on a list of conversations and returns a list of anomaly-scores per conversation + #def run_Kitsune_on_conversations(self, conversation_list, threshold): + # result_list = [] + # malicious = 0 + # for conversation in conversation_list: + # result = self.K.feed_batch(conversation) + # # Normalize result if 
maximum is a positive + # if max(result) >= 1.0: + # result = [float(i) / max(result) for i in result] + # # If one of the results is higher than the threshold, then mark as malicious + # if max(result) > threshold: + # malicious = 1 + # # Add a tuple of conversation and malicious/benign + # result_list.append((conversation, malicious)) + # return result_list + + # Loads conversations list from a pickle file + def conversations_loader(self, newpickle=None): + print("Loading conversations from file") + path = 'pickles/conversationsList.pkl' + if newpickle != None: + path = newpickle + with open(path, 'rb') as f: + conversations_list = pickle.load(f) + self.conversations_list = conversations_list + return conversations_list + + # Writes conversation list to a pickle file + def conversation_pickle(self, newpickle=None): + print("Writing conversations to file") + path = 'pickles/conversationsList.pkl' + if newpickle != None: + path = newpickle + with open(path, 'wb') as f: + pickle.dump(self.conversations_list, f) + + # DEPRECATED Verifies a batch of conversations to be benign or malicious + #def verify_test_results(self, conv_list, threshold): + # result_list = [] + # for conv in conv_list: + # # If one of the results is higher than the threshold, then mark as malicious + # malicious = 0 + # if max(conv[1]) > threshold: + # malicious = 1 + # result_list.append((conv[0], malicious)) + # return result_list + + def load_pcap_to_features(self, input_path): + print('Running dummy instance of Kitsune') + dummyKit = Kitsune(input_path, np.Inf, 6, 10, 15) + self.features_list = dummyKit.get_feature_list() + return self.features_list + + def read_label_file(self, csvpath): + with open(csvpath, newline='') as csvfile: + returnList = [] + labelreader = csv.reader(csvfile, delimiter=' ') + for row in labelreader: + row = row[0].strip('][').split(',') + returnList.append(row) + return returnList + + def sample_packets_by_conversation(self, tsvpath, outpath, labels): + # We open the output writer to write to a new TSV file + with open(outpath, 'w') as op: + wr = csv.writer(op) + # We open the reader to get the packets from the original TSV file + with open(tsvpath) as fd: + rd = csv.reader(fd, delimiter="\t", quotechar='"') + pkt_iter = -1 + for row in rd: + if pkt_iter % 100000 == 0: + print(pkt_iter) + if pkt_iter == -1: + pkt_iter += 1 + continue + # Labels is the list of conversations, that has previously been sampled to 10 percent of conversations + for label in labels: + if label[0] == 'Src': + continue + if (row[4] == label[0] and row[6] == label[1] and row[5] == label[2] and row[7] == label[3]) or (row[4] == label[2] and row[6] == label[3] and row[5] == label[0] and row[7] == label[1]): + label_iter = label[5] + label_val = label[4] + row.append(str(pkt_iter)) + row.append(str(label_iter)) + row.append(str(label_val)) + wr.writerow(row) + break + pkt_iter += 1 + op.close() + + def map_packets_to_features(self, packet_path, feature_path, sampled_feature_path): + # Step 1: Read the packet TSV file and create a set of packet indices + subset_indices = set() + row_index = 0 + with open(packet_path, 'r', newline='') as packet_file: + csvreader = csv.reader(packet_file) + for row in csvreader: + if row: + #packet_index = int(row[19]) # Assuming index is in the 20th column + packet_index = int(row[22]) # Assuming index is in the 23rd column + subset_indices.add(packet_index) + row_index += 1 + # Step 2: Read the required statistics from the large feature CSV file + # and write them to the output CSV file 
+ with open(feature_path, 'r', newline='') as feature_file, open(sampled_feature_path, 'w', newline='') as output_file: + csvreader = csv.reader(feature_file) + csvwriter = csv.writer(output_file) + + for row_num, row in enumerate(csvreader, start=1): + packet_index = row_num # Index is the row number + # Check if the packet index is in the list of subset indices + if packet_index in subset_indices: + # Write the row to the output CSV file + csvwriter.writerow(row) + + # Runs a hyperparameter optimization on the supplied dataset, constrained by number of runs and packet limit + # This version uses KitNET directly instead of running Kitsune as a whole + def hyper_opt_KitNET(self, feature_path, training_cutoff): + def objective(trial): + numAE = trial.suggest_int('numAE', 0, 200) + learning_rate = trial.suggest_float('learning_rate', 0, 0.5) + hidden_ratio = trial.suggest_float('hidden_ratio', 0, 1) + FMgrace = trial.suggest_int('FMgrace', 0, 500000) + + kit = KitNET(100, max_autoencoder_size=numAE, FM_grace_period=FMgrace, AD_grace_period=math.floor(training_cutoff*0.9), learning_rate=learning_rate, hidden_ratio=hidden_ratio) + # Stream the features from the CSV one packet at a time + counter = 0 + + with open(feature_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + train_err = [] + for packet in rd_ft: + if packet: + packet = packet[0].split(',') + packet = [float(element) for element in packet] + packet = np.array(packet) + if counter % 10000 == 0: + print("training: "+str(counter)) + if counter <= training_cutoff: + train_err.append(kit.train(packet)) + else: + break + counter += 1 + fp.close() + + y_pred = [] + path = 'pickles/validateFeatureList.pkl' + counterValidate = 0 + with open(path, 'rb') as f: + validateList = pickle.load(f) + for packet in validateList: + score = kit.execute(packet) + if counterValidate % 100000 == 0: + print("testing: "+str(counterValidate)) + y_pred.append(score) + counterValidate += 1 + + trial.set_user_attr("training_error", np.mean(train_err)) + trial.set_user_attr("train_median", np.median(train_err)) + trial.set_user_attr("train_25_percentile", np.percentile(train_err, 25)) + trial.set_user_attr("train_75_percentile", np.percentile(train_err, 75)) + trial.set_user_attr("train_max", np.max(train_err)) + trial.set_user_attr("testing_error", np.mean(y_pred)) + trial.set_user_attr("test_median", np.median(y_pred)) + trial.set_user_attr("test_25_percentile", np.percentile(y_pred, 25)) + trial.set_user_attr("test_75_percentile", np.percentile(y_pred, 75)) + trial.set_user_attr("test_max", np.max(y_pred)) + + median_value = np.median(train_err) + median_absolute_deviation = np.median([abs(number - median_value) for number in train_err]) + trial.set_user_attr("mad", median_absolute_deviation) + + threshold = median_value + 2 * median_absolute_deviation + trial.set_user_attr("threshold", threshold) + + trial.set_user_attr("test_minus_train_error", np.mean(y_pred)-np.mean(train_err)) + + anomaly_count = 0 + for err in y_pred: + if err > threshold: + anomaly_count += 1 + + trial.set_user_attr("anomaly_count", anomaly_count) + trial.set_user_attr("train_packets", training_cutoff) + trial.set_user_attr("test_packets", len(y_pred)) + + FPR = anomaly_count / len(y_pred) + return FPR
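# A small worked example of the median + 2*MAD threshold computed above, with illustrative
# numbers (on a validation set assumed benign, every score above the threshold counts as a
# false positive, hence FPR as the objective value):
train_err = [0.10, 0.12, 0.11, 0.13, 0.50]
median_value = np.median(train_err)                          # 0.12
mad = np.median([abs(e - median_value) for e in train_err])  # 0.01
threshold = median_value + 2 * mad                           # 0.14
y_pred = [0.09, 0.13, 0.90]
FPR = sum(e > threshold for e in y_pred) / len(y_pred)       # 1/3 of packets flagged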
+ # Dashboard logic + search_space = { + 'numAE': [5, 10, 15, 25, 50, 75, 150], + 'learning_rate': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1], + 'hidden_ratio': [0.25, 0.5, 0.75], + 'FMgrace': [math.floor(0.05*training_cutoff), math.floor(0.10*training_cutoff), math.floor(0.20 * training_cutoff)] + } + # NOTE: the narrower grid below overrides the full grid above; remove it to run the complete sweep + search_space = { + 'numAE': [50], + 'learning_rate': [0.0001], + 'hidden_ratio': [0.25], + 'FMgrace': [math.floor(0.05*training_cutoff)] + } + name = "mad2_hyperopt" + str(training_cutoff) + "learnlowerhigheragainappend" + study = optuna.create_study(sampler=optuna.samplers.GridSampler(search_space), storage="sqlite:///hyperopt.db", study_name=name) + study.optimize(objective, n_trials=27) + + # Create a new workbook and select the active worksheet + wb = Workbook() + ws = wb.active + + # Write header row + header = ["Trial Number", "numAE", "learning_rate", "hidden_ratio"] + ws.append(header) + + # Write trial information + best_value = float("inf") + best_row_idx = None # Track the index of the best row + for idx, trial in enumerate(study.trials, start=2): # Start from row 2 to leave room for the header + trial_params = trial.params + trial_row = [trial.number, trial_params["numAE"], trial_params["learning_rate"], + trial_params["hidden_ratio"], trial.value] + ws.append(trial_row) + + if trial.value < best_value: + best_value = trial.value + best_row_idx = idx + + # Set fill color for the best value row + green_fill = PatternFill(start_color="00FF00", end_color="00FF00", fill_type="solid") + if best_row_idx is not None: + for cell in ws[best_row_idx]: + cell.fill = green_fill + + # Save the workbook to a file + excel_file_path = "output_data/hyperparameter_optimization_results_" + datetime.now().strftime( + '%d-%m-%Y_%H-%M') + ".xlsx" + wb.save(excel_file_path) + + print("Results exported to", excel_file_path) + return study.best_trial + + # Runs a hyperparameter optimization on the supplied dataset, constrained by number of runs and packet limit + # This version uses KitNET directly instead of running Kitsune as a whole + def hyper_opt_KitNET_mean(self, feature_path, training_cutoff, total_cutoff): + def objective(trial): + numAE = trial.suggest_int('numAE', 0, 200) + learning_rate = trial.suggest_float('learning_rate', 0, 0.5) + hidden_ratio = trial.suggest_float('hidden_ratio', 0, 1) + FMgrace = trial.suggest_int('FMgrace', 0, 500000) + + kit = KitNET(100, max_autoencoder_size=numAE, FM_grace_period=FMgrace, AD_grace_period=math.floor(training_cutoff*0.9), learning_rate=learning_rate, hidden_ratio=hidden_ratio) + # Stream the features from the CSV one packet at a time + iter = 0 + with open(feature_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + + train_err = [] + y_pred = [] + for packet in rd_ft: + if packet: + packet = packet[0].split(',') + packet = [float(element) for element in packet] + packet = np.array(packet) + if iter % 10000 == 0: + print(iter) + if iter < total_cutoff: + if iter <= training_cutoff: + train_err.append(kit.train(packet)) + else: + score = kit.execute(packet) + y_pred.append(score) + iter += 1 + else: + break + fp.close() + + trial.set_user_attr("training_error", np.mean(train_err)) + error = np.mean(y_pred) + print('error') + print(error) + return error + + # Dashboard logic + search_space = { + 'numAE': [5, 10, 15, 25, 50, 75, 150], + 'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.13, 0.2], + 'hidden_ratio': [0.25, 0.5, 0.75], + 'FMgrace': [math.floor(0.05*training_cutoff), math.floor(0.10*training_cutoff), math.floor(0.20 * training_cutoff)] + } + name = "hyperopt_mean_" + str(total_cutoff) + study = optuna.create_study(sampler=optuna.samplers.GridSampler(search_space), storage="sqlite:///hyperopt.db", study_name=name) + study.optimize(objective, n_trials=7*7*3*3) # one trial per grid point (7*7*3*3 = 441 combinations) + + # Create a new workbook and select the active 
worksheet + wb = Workbook() + ws = wb.active + + # Write header row + header = ["Trial Number", "numAE", "learning_rate", "hidden_ratio"] + ws.append(header) + + # Write trial information + best_value = float("inf") + best_row_idx = None # Track the index of the best row + for idx, trial in enumerate(study.trials, start=2): # Start from row 2 to leave room for the header + trial_params = trial.params + trial_row = [trial.number, trial_params["numAE"], trial_params["learning_rate"], + trial_params["hidden_ratio"], trial.value] + ws.append(trial_row) + + if trial.value < best_value: + best_value = trial.value + best_row_idx = idx + + # Set fill color for the best value row + green_fill = PatternFill(start_color="00FF00", end_color="00FF00", fill_type="solid") + if best_row_idx is not None: + for cell in ws[best_row_idx]: + cell.fill = green_fill + + # Save the workbook to a file + excel_file_path = "output_data/hyperparameter_optimization_results_" + datetime.now().strftime( + '%d-%m-%Y_%H-%M') + ".xlsx" + wb.save(excel_file_path) + + print("Results exported to", excel_file_path) + return study.best_trial + + # Calculates KitNET's SHAP-values for the specified indexes + def shap_values_builder_separate_train_test_csv(self, train_path, test_path, training_cutoff, test_cutoff, numAE, learning_rate, hidden_ratio): + self.KitTest = KitNET(100, numAE, math.floor(training_cutoff * 0.1), math.floor(training_cutoff*0.9), learning_rate, hidden_ratio) + # Load CSV file since it probably will not be too big + with open(train_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + train_features = [] + for packet in rd_ft: + if packet: + packet = packet[0].split(',') + packet = [float(element) for element in packet] + packet = np.array(packet) + train_features.append(packet) + fp.close() + print('Done building train feature array') + + # Load CSV file since it probably will not be too big + with open(test_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + test_features = [] + iter = 0 + for packet in rd_ft: + if packet: + if iter >= test_cutoff: + break + packet = packet[0].split(',') + packet = [float(element) for element in packet] + packet = np.array(packet) + test_features.append(packet) + iter += 1 + fp.close() + print('Done building test feature array') + + trainfeaturesNP = np.array(train_features) + + print('Training KitNET') + self.KitTest.process_batch(trainfeaturesNP[:training_cutoff]) + print("Building SHAP explainer") + self.explainer = shap.Explainer(self.kitnet_model, trainfeaturesNP[:training_cutoff]) + print("Calculating SHAP values") + newfeatures = random.sample(test_features, 40) + # Get 40 random packets from test set + self.shap_values = self.explainer.shap_values(np.array(newfeatures)) + self.metadata = { + "filename": train_path, + "packet_limit": training_cutoff, + "num_autenc": numAE, + "FMgrace": math.floor(training_cutoff * 0.1), + "ADgrace": math.floor(training_cutoff * 0.9), + "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") + } + return self.shap_values + + def shap_values_builder_from_csv(self, path, training_cutoff, total_cutoff, numAE, learning_rate, hidden_ratio): + self.KitTest = KitNET(100, numAE, math.floor(training_cutoff * 0.1), math.floor(training_cutoff * 0.9), + learning_rate, hidden_ratio) + # Load CSV file since it probably will not be too big + with open(path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + features = [] + for packet in rd_ft: + if packet: + packet = packet[0].split(',') + packet = 
[float(element) for element in packet] + packet = np.array(packet) + features.append(packet) + fp.close() + print('Done building feature array') + + featuresNP = np.array(features) + + print('Training KitNET') + self.KitTest.process_batch(featuresNP[:training_cutoff]) + print("Building SHAP explainer") + self.explainer = shap.Explainer(self.kitnet_model, featuresNP[:training_cutoff]) + print("Calculating SHAP values") + newfeatures = features[training_cutoff:total_cutoff] + newfeatures = random.sample(newfeatures, 100) + # Get 40 random packets from test set + self.shap_values = self.explainer.shap_values(np.array(newfeatures)) + self.metadata = { + "filename": path, + "packet_limit": total_cutoff, + "num_autenc": numAE, + "FMgrace": math.floor(training_cutoff * 0.1), + "ADgrace": math.floor(training_cutoff * 0.9), + "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M:%S") + } + return self.shap_values + + def run_kitsune_from_feature_csv(self, feature_path, training_cutoff, total_cutoff, numAE, learning_rate, hidden_ratio): + kit = KitNET(100, numAE, math.floor(training_cutoff * 0.05), training_cutoff * 0.9, learning_rate, + hidden_ratio) + # Load the feature list beforehand to save time + y_pred = [] + counter = 0 + with open(feature_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + + for packet in rd_ft: + if packet: + packet = packet[0].split(',') + packet = [float(element) for element in packet] + packet = np.array(packet) + if counter % 10000 == 0: + print(counter) + if counter < total_cutoff: + if counter <= training_cutoff: + kit.train(packet) + else: + score = kit.execute(packet) + y_pred.append(score) + counter += 1 + else: + break + fp.close() + print("Writing anomaly detector to file") + path = 'pickles/anomDetectorFullDataset.pkl' + with open(path, 'wb') as f: + pickle.dump(kit, f) + return y_pred + + def run_trained_kitsune_from_feature_csv(self, test_path, test_start, test_limit): + #kit = KitNET(100, 10, math.floor(12000000 * 0.05), math.floor(12000000 * 0.9), 0.30, 0.25) + # kit = KitNET(100, 50, math.floor(10000000 * 0.05), 10000000, 0.0005, 0.25) + # + # + # + # counter = 0 + # with open('input_data/features.csv') as fp: + # rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + # + # for packet in rd_ft: + # if packet: + # packet = packet[0].split(',') + # packet = [float(element) for element in packet] + # packet = np.array(packet) + # if counter % 10000 == 0: + # print(counter) + # if counter < math.floor(700000): + # kit.train(packet) + # counter += 1 + # else: + # break + # fp.close() + # path = 'pickles/anomDetectorFullDataset2.pkl' + # with open(path, 'wb') as f: + # pickle.dump(kit, f) + # print('testing') + # quit() + # # Load the feature list beforehand to save time + # counter = 0 + # print(test_start) + # print(test_limit) + # with open(test_path) as fp: + # rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + # resultList = [] + # for packet in rd_ft: + # if counter % 10000 == 0: + # print(counter) + # if packet and counter > test_start: + # print('testing1') + # print(counter) + # packet = packet[0].split(',') + # packet = [float(element) for element in packet] + # packet = np.array(packet) + # if counter < test_limit: + # print('test2') + # resultList.append(kit.execute(packet)) + # else: + # break + # counter += 1 + # fp.close() + # print("Writing anomaly detector to file") + # path = 'pickles/anomDetector.pkl' + # with open(path, 'wb') as f: + # pickle.dump(kit, f) + with open("pickles/anomDetectorFullDataset.pkl", 'rb') as f: + 
kit = pickle.load(f) + + counter = 0 + results = [] + with open(test_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + + for packet in rd_ft: + if packet: + packet = packet[0].split(',') + packet = [float(element) for element in packet] + packet = np.array(packet) + if counter % 10000 == 0: + print('running: ') + print(counter) + if counter < test_limit: + results.append(kit.execute(packet)) + counter += 1 + else: + break + fp.close() + return results + + def run_trained_kitsune_from_tsv(self, test_path, test_limit): + path = 'pickles/anomDetectorFullDataset.pkl' + with open(path, 'rb') as f: + kit = pickle.load(f) + + # Load the feature list beforehand to save time + #iter = 0 + #with open(feature_path) as fp: + #rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + + #for packet in rd_ft: + # if packet: + # packet = packet[0].split(',') + # packet = [float(element) for element in packet] + # packet = np.array(packet) + # if iter % 10000 == 0: + # print(iter) + # if iter < training_cutoff: + # kit.train(packet) + # iter += 1 + # else: + # break + #fp.close() + #print("Writing anomaly detector to file") + #path = 'pickles/anomDetector.pkl' + #with open(path, 'wb') as f: + # pickle.dump(kit, f) + #with open(path, 'rb') as f: + # newKit = pickle.load(f) + + counter = 0 + results = [] + with open(test_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + for packet in rd_ft: + if packet and counter > 0: + print(counter) + features = self.get_features_for_packet(packet) + results.append(kit.execute(features)) + counter += 1 + fp.close() + return results + + def map_results_to_conversation(self, results, pcap_path): + counter = 0 + conv_dict = {} + with open(pcap_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + for packet in rd_ft: + if counter < len(results): + if packet: + packet = packet[0].split(',') + result = results[counter] + conv_number = packet[23] + if conv_number not in conv_dict: + conv_dict[conv_number] = [] + conv_dict[conv_number].append(result) + counter += 1 + else: + break + fp.close() + return conv_dict + + def map_results_to_conversation_tuple(self, results, pcap_path): + counter = 0 + conv_dict = {} + with open(pcap_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + for packet in rd_ft: + if counter < len(results): + if packet: + packet = packet[0].split(',') + result = results[counter] + conv_number = packet[23] + if conv_number not in conv_dict: + conv_dict[conv_number] = [] + conv_dict[conv_number].append({counter: result}) + counter += 1 + else: + break + fp.close() + return conv_dict + + def run_kitsune_from_feature_pickle(self, feature_path, training_cutoff, total_cutoff, numAE, learning_rate, hidden_ratio, pickle_path=None): + kit = KitNET(100, numAE, math.floor(training_cutoff * 0.1), math.floor(training_cutoff * 0.9), learning_rate, hidden_ratio) + + #path = 'pickles/anomDetector.pkl' + #if pickle_path != None: + # path = pickle_path + #with open(path, 'rb') as f: + # kit = pickle.load(f) + + # Load the feature list beforehand to save time + iter = 0 + with open(feature_path) as fp: + rd_ft = csv.reader(fp, delimiter="\t", quotechar='"') + + y_pred = [] + for packet in rd_ft: + if packet: + packet = packet[0].split(',') + packet = [float(element) for element in packet] + packet = np.array(packet) + if iter % 10000 == 0: + print(iter) + if iter < total_cutoff: + if iter <= training_cutoff: + kit.train(packet) + else: + score = kit.execute(packet) + y_pred.append(score) + iter += 1 + 
+    def run_kitsune_from_feature_pickle(self, feature_path, training_cutoff, total_cutoff, numAE, learning_rate, hidden_ratio, pickle_path=None):
+        kit = KitNET(100, numAE, math.floor(training_cutoff * 0.1), math.floor(training_cutoff * 0.9), learning_rate, hidden_ratio)
+        # Optionally resume from a previously pickled detector
+        if pickle_path is not None:
+            with open(pickle_path, 'rb') as f:
+                kit = pickle.load(f)
+
+        counter = 0
+        y_pred = []
+        with open(feature_path) as fp:
+            rd_ft = csv.reader(fp, delimiter="\t", quotechar='"')
+            for packet in rd_ft:
+                if packet:
+                    packet = packet[0].split(',')
+                    packet = [float(element) for element in packet]
+                    packet = np.array(packet)
+                    if counter % 10000 == 0:
+                        print(counter)
+                    if counter < total_cutoff:
+                        if counter <= training_cutoff:
+                            kit.train(packet)
+                        else:
+                            score = kit.execute(packet)
+                            y_pred.append(score)
+                        counter += 1
+                    else:
+                        break
+        print("Writing anomaly detector to file")
+        path = 'pickles/anomDetector.pkl'
+        with open(path, 'wb') as f:
+            pickle.dump(kit, f)
+        return y_pred
+
+    def get_features_for_packet(self, packet):
+        # Parse one tshark TSV row into Kitsune's input fields and update the running statistics
+        row = packet
+        IPtype = np.nan
+        timestamp = row[0]
+        framelen = row[1]
+        srcIP = ''
+        dstIP = ''
+        tcpFlags = row[19]
+        payload = ''
+        # payload = int(row[20]) + int(row[21])
+        if row[4] != '':  # IPv4
+            srcIP = row[4]
+            dstIP = row[5]
+            IPtype = 0
+        elif row[17] != '':  # IPv6
+            srcIP = row[17]
+            dstIP = row[18]
+            IPtype = 1
+        srcproto = row[6] + row[8]  # UDP or TCP port: concatenating the two port strings acts as an OR "[tcp|udp]"
+        dstproto = row[7] + row[9]  # UDP or TCP port
+        srcMAC = row[2]
+        dstMAC = row[3]
+        if srcproto == '':  # it's an L2/L1-level protocol
+            if row[12] != '':  # is ARP
+                srcproto = 'arp'
+                dstproto = 'arp'
+                srcIP = row[14]  # src IP (ARP)
+                dstIP = row[16]  # dst IP (ARP)
+                IPtype = 0
+            elif row[10] != '':  # is ICMP
+                srcproto = 'icmp'
+                dstproto = 'icmp'
+                IPtype = 0
+            elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
+                srcIP = row[2]  # src MAC
+                dstIP = row[3]  # dst MAC
+        ### Extract features
+        try:
+            return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto,
+                                             int(framelen),
+                                             float(timestamp), tcpFlags, payload)
+        except Exception as e:
+            print(e)
+            return []
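+
+    # Column layout assumed by get_features_for_packet (indices into the tshark TSV row,
+    # inferred from the accesses above): 0 epoch time, 1 frame length, 2/3 src/dst MAC,
+    # 4/5 src/dst IPv4, 6-9 TCP/UDP ports, 10 ICMP type, 12 ARP opcode, 14/16 ARP IPs,
+    # 17/18 src/dst IPv6, 19 TCP flags; column 23 holds the conversation number read by
+    # the map_results_to_conversation helpers above.
+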
+    def most_significant_packets_sampler(self, day, threshold):
+        root_folder = "."
+        attack_types_folder = os.path.join(root_folder, "input_data/attack_types")
+        pickles_folder = os.path.join(root_folder, "pickles/output_pickles_packet_basis")
+
+        for attack_type in os.listdir(attack_types_folder):
+            if attack_type == f"{day}_features.csv" or attack_type == f"{day}_BENIGN.csv" or not (attack_type.startswith(day) and attack_type.endswith(".csv")):
+                continue
+            attack_type = attack_type.replace(".csv", "")
+            attack_type = attack_type.replace(f"{day}_features_", "")
+
+            # Construct the file names for features and pickle file
+            feature_file_name = f"{day}_features_{attack_type}.csv"
+            pickle_file_name = f"{day.title()}_{attack_type}_results.pkl"
+            feature_file_path = os.path.join(attack_types_folder, feature_file_name)
+            pickle_file_path = os.path.join(pickles_folder, pickle_file_name)
+
+            # Skip attack types for which no reconstruction errors were pickled
+            if not os.path.exists(pickle_file_path):
+                continue
+            # Load the pickle file containing reconstruction errors
+            with open(pickle_file_path, 'rb') as pickle_file:
+                reconstruction_errors = pickle.load(pickle_file)
+
+            # Load the corresponding feature CSV file
+            features_df = pd.read_csv(feature_file_path, header=None)
+            # Take the highest-scoring packet of every conversation
+            conv_scores = self.map_results_to_conversation_tuple(reconstruction_errors, f"input_data/attack_types/{day}_{attack_type}.pcap.tsv")
+            max_packets = []
+            for conv in conv_scores:
+                max_dict = max(conv_scores[conv], key=lambda x: list(x.values())[0])
+                max_packets.append(max_dict)
+
+            # Split conversations into detected (score above threshold) and missed ones
+            true_positive = []
+            false_negative = []
+            for item in max_packets:
+                value = list(item.values())[0]  # extract the score from the {index: score} dict
+                if value > threshold:
+                    true_positive.append(item)
+                else:
+                    false_negative.append(item)
+
+            sorted_keys_tp = [list(d.keys())[0] for d in true_positive]
+            sorted_keys_fn = [list(d.keys())[0] for d in false_negative]
+            # Extract the significant features
+            significant_features_tp = features_df.iloc[sorted_keys_tp]
+            if len(significant_features_tp) > 0:
+                if len(significant_features_tp) > 40:
+                    significant_features_tp = significant_features_tp.sample(n=40, replace=False)
+                # Save the significant features to a new CSV file
+                print(f'writing {attack_type} to file')
+                output_file_name = f"{day}_features_{attack_type}_tp_most_significant.csv"
+                output_file_path = os.path.join(attack_types_folder, output_file_name)
+                significant_features_tp.to_csv(output_file_path, index=False, header=False)
+
+            significant_features_fn = features_df.iloc[sorted_keys_fn]
+            if len(significant_features_fn) > 0:
+                if len(significant_features_fn) > 40:
+                    significant_features_fn = significant_features_fn.sample(n=40, replace=False)
+                # Save the significant features to a new CSV file
+                output_file_name = f"{day}_features_{attack_type}_fn_most_significant.csv"
+                output_file_path = os.path.join(attack_types_folder, output_file_name)
+                significant_features_fn.to_csv(output_file_path, index=False, header=False)
+
+    def shap_values_builder_from_features(self, test_feature_path, benign_feature_path):
+        path = 'pickles/anomDetectorFullDataset.pkl'
+        with open(path, 'rb') as f:
+            kit = pickle.load(f)
+
+        # Wrap the detector so SHAP can score a batch of feature vectors
+        def callKit(featureList):
+            results = []
+            for features in featureList:
+                results.append(kit.execute(features))
+            return np.array(results)
+
+        # Load the test features; the CSV is expected to be small enough to hold in memory
+        with open(test_feature_path) as fp:
+            rd_ft = csv.reader(fp, delimiter="\t", quotechar='"')
+            test_features = []
+            for feature in rd_ft:
+                feature = feature[0].split(',')
+                feature = [float(element) for element in feature]
+                feature = np.array(feature)
+                test_features.append(feature)
+        print('Done building test feature array')
+
+        # Load the benign background features the same way
+        with open(benign_feature_path) as fp:
+            rd_ft = csv.reader(fp, delimiter="\t", quotechar='"')
+            benign_features = []
+            for feature in rd_ft:
+                feature = feature[0].split(',')
+                feature = [float(element) for element in feature]
+                feature = np.array(feature)
+                benign_features.append(feature)
+
+        print("Building SHAP explainer")
+        explainer = shap.KernelExplainer(callKit, np.array(benign_features[:40]))
+        print("Calculating SHAP values")
+        self.shap_values = explainer.shap_values(np.array(test_features[:40]))
+        return self.shap_values
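+
+    # Sketch (illustrative): the returned shap_values array has one row per explained
+    # packet and one column per input feature, so averaging absolute values per column
+    # ranks feature importance. test_csv and benign_csv are placeholder paths.
+    #
+    # shap_values = kitplugin.shap_values_builder_from_features(test_csv, benign_csv)
+    # importance = np.abs(np.array(shap_values)).mean(axis=0)
+    # top10 = np.argsort(importance)[-10:]  # indices of the ten most influential features
+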
+    # Calculates SHAP values for each available attack type in a day of the week; writes results to Excel and pickles them
+    def shap_documenter(self, day):
+        root_folder = "."
+        attack_types_folder = os.path.join(root_folder, "input_data/attack_types")
+        self.workbook = openpyxl.load_workbook('input_data/template_statistics_file.xlsx')
+        count = 0
+        for attack_type in os.listdir(attack_types_folder):
+            if not (attack_type.startswith(day) and attack_type.endswith("most_significant.csv")):
+                continue
+            if 'XSS' not in attack_type:
+                continue
+            attack_type = attack_type.replace(".csv", "")
+            attack_type = attack_type.replace(f"{day}_features_", "")
+            # Loop over the different Kitsune configs we are going to make
+            shap_values = self.shap_values_builder_from_features(
+                f"input_data/attack_types/{day}_features_{attack_type}.csv",
+                "input_data/attack_types/monday_features_sample_medium_validate.csv")
+
+            path = f'pickles/output_pickles/{day.title()}_{attack_type}shap_results.pkl'
+            with open(path, 'wb') as f:
+                pickle.dump(shap_values, f)
+            # Could do this with a regular expression, but I'm a sane person
+            self.create_sheet(day, attack_type.replace("most_significant", "").replace("-", "").replace("_", "").replace(" ", ""))
+            count += 1
+        excel_file = f"output_data/shap_{day}_{datetime.now().strftime('%d-%m-%Y_%H-%M')}.xlsx"
+        self.workbook.save(excel_file)
diff --git a/Kitsune.py b/Kitsune.py
index 08ebc8c..e7d6c66 100644
--- a/Kitsune.py
+++ b/Kitsune.py
@@ -1,5 +1,6 @@
 from FeatureExtractor import *
 from KitNET.KitNET import KitNET
+import numpy as np
 
 # MIT License
 #
@@ -24,7 +25,7 @@
 # SOFTWARE.
 
 class Kitsune:
-    def __init__(self,file_path,limit,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75,):
+    def __init__(self,file_path,limit,max_autoencoder_size=10,FM_grace_period=None,AD_grace_period=10000,learning_rate=0.1,hidden_ratio=0.75):
         #init packet feature extractor (AfterImage)
         self.FE = FE(file_path,limit)
 
@@ -40,3 +41,16 @@ def proc_next_packet(self):
         # process KitNET
         return self.AnomDetector.process(x)  # will train during the grace periods, then execute on all the rest.
 
+    def get_feature_list(self, csv=False, single=False):
+        vectorList = self.FE.get_all_vectors(csv, single)
+        return vectorList
+
+    def feed_batch(self, data):
+        # Process a pre-built batch of feature vectors through KitNET
+        resultList = []
+        count = 0
+        for instance in data:
+            if count % 1000 == 0:
+                print("processing packet ", count, " / ", len(data))
+            resultList.append(self.AnomDetector.process(instance))
+            count += 1
+        return np.array(resultList)
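+
+    # Usage sketch (illustrative): build all feature vectors once, then score them as
+    # a batch. The pcap name is a placeholder.
+    #
+    # K = Kitsune("capture.pcap", np.Inf, 10, 5000, 50000)
+    # vectors = K.get_feature_list()
+    # rmses = K.feed_batch(vectors)
+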
diff --git a/example.py b/example.py
index 7ea9c63..af80063 100644
--- a/example.py
+++ b/example.py
@@ -1,71 +1,264 @@
-from Kitsune import Kitsune
+import csv
+import math
+import pickle
+from math import floor
+
 import numpy as np
-import time
-
-##############################################################################
-# Kitsune a lightweight online network intrusion detection system based on an ensemble of autoencoders (kitNET).
-# For more information and citation, please see our NDSS'18 paper: Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection
-
-# This script demonstrates Kitsune's ability to incrementally learn, and detect anomalies in recorded a pcap of the Mirai Malware.
-# The demo involves an m-by-n dataset with n=115 dimensions (features), and m=100,000 observations.
-# Each observation is a snapshot of the network's state in terms of incremental damped statistics (see the NDSS paper for more details)
-
-#The runtimes presented in the paper, are based on the C++ implimentation (roughly 100x faster than the python implimentation)
-################### Last Tested with Anaconda 3.6.3 #######################
-
-# Load Mirai pcap (a recording of the Mirai botnet malware being activated)
-# The first 70,000 observations are clean...
-print("Unzipping Sample Capture...")
-import zipfile
-with zipfile.ZipFile("mirai.zip","r") as zip_ref:
-    zip_ref.extractall()
-
-
-# File location
-path = "mirai.pcap" #the pcap, pcapng, or tsv file to process.
-packet_limit = np.Inf #the number of packets to process
-
-# KitNET params:
-maxAE = 10 #maximum size for any autoencoder in the ensemble layer
-FMgrace = 5000 #the number of instances taken to learn the feature mapping (the ensemble's architecture)
-ADgrace = 50000 #the number of instances used to train the anomaly detector (ensemble itself)
-
-# Build Kitsune
-K = Kitsune(path,packet_limit,maxAE,FMgrace,ADgrace)
-
-print("Running Kitsune:")
-RMSEs = []
-i = 0
-start = time.time()
-# Here we process (train/execute) each individual packet.
-# In this way, each observation is discarded after performing process() method.
-while True:
-    i+=1
-    if i % 1000 == 0:
-        print(i)
-    rmse = K.proc_next_packet()
-    if rmse == -1:
-        break
-    RMSEs.append(rmse)
-stop = time.time()
-print("Complete. Time elapsed: "+ str(stop - start))
-
-
-# Here we demonstrate how one can fit the RMSE scores to a log-normal distribution (useful for finding/setting a cutoff threshold \phi)
-from scipy.stats import norm
-benignSample = np.log(RMSEs[FMgrace+ADgrace+1:100000])
-logProbs = norm.logsf(np.log(RMSEs), np.mean(benignSample), np.std(benignSample))
-
-# plot the RMSE anomaly scores
-print("Plotting results")
-from matplotlib import pyplot as plt
-from matplotlib import cm
-plt.figure(figsize=(10,5))
-fig = plt.scatter(range(FMgrace+ADgrace+1,len(RMSEs)),RMSEs[FMgrace+ADgrace+1:],s=0.1,c=logProbs[FMgrace+ADgrace+1:],cmap='RdYlGn')
-plt.yscale("log")
-plt.title("Anomaly Scores from Kitsune's Execution Phase")
-plt.ylabel("RMSE (log scaled)")
-plt.xlabel("Time elapsed [min]")
-figbar=plt.colorbar()
-figbar.ax.set_ylabel('Log Probability\n ', rotation=270)
-plt.show()
+from scapy.utils import rdpcap
+from random import sample
+import matplotlib.pyplot as plt
+
+from openpyxl import Workbook
+from openpyxl.chart import BarChart, Reference
+
+from KitPlugin import KitPlugin
+
+inputs = {
+    "mirai_malicious": {
+        "input_path": "input_data/mirai.pcap",
+        "input_path_test": "input_data/mirai.pcap",
+        "packet_limit": 200000,
+        "maxAE": 10,
+        "FMgrace": 5000,
+        "ADgrace": 50000,
+        "training_min": 0,
+        "training_max": 60000,
+        "testing_min": 140330,
+        "testing_max": 140355
+    },
+    "mirai_benign": {
+        "input_path": "input_data/mirai.pcap",
+        "input_path_test": "input_data/mirai.pcap",
+        "packet_limit": 200000,
+        "maxAE": 10,
+        "FMgrace": 5000,
+        "ADgrace": 50000,
+        "training_min": 0,
+        "training_max": 60000,
+        "testing_min": 70000,
+        "testing_max": 70025
+    }
+}
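+
+# Sketch (illustrative): how one entry of `inputs` maps onto a single run. The
+# constructor and method names follow the commented examples below.
+#
+# cfg = inputs["mirai_benign"]
+# kp = KitPlugin(input_path=cfg["input_path"], packet_limit=cfg["packet_limit"],
+#                num_autenc=cfg["maxAE"], FMgrace=cfg["FMgrace"], ADgrace=cfg["ADgrace"],
+#                learning_rate=0.1, hidden_ratio=0.75)
+# kp.feature_loader()
+# kp.kit_trainer(cfg["training_min"], cfg["training_max"])
+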
+
+# KitPlugin = KitPlugin()
+# KitPlugin.hyper_opt("input_data/Monday-WorkingHours_10_percent_random.pcap", 100, 1000000)
+
+# Run a series of statistics
+# KitPlugin.run_series_stats(inputs)
+# Get feature list from pickle file
+# KitPlugin.feature_loader()
+# Train Kitsune on the training data
+# KitPlugin.kit_trainer(training_min, training_max)
+# Calculate SHAP-values
+# KitPlugin.shap_values_builder(training_min, training_max, testing_min, testing_max)
+# Pickle SHAP-values
+# KitPlugin.shap_values_pickle()
+# Load SHAP-values
+# KitPlugin.shap_values_loader()
+# Calculate SHAP summary statistics
+# KitPlugin.shap_stats_summary_builder(testing_min, testing_max)
+# KitPlugin.shap_stats_excel_export()
+
+# Calculate EER and AUC values
+# KitPlugin = KitPlugin(input_path="input_data/mirai.pcap", packet_limit=200000, num_autenc=10, FMgrace=5000, ADgrace=50000, learning_rate=0.1, hidden_ratio=0.75)
+# KitPlugin.feature_loader()
+# KitPlugin.kit_trainer(0, 60000)
+# KitPlugin.model_pickle()
+# ONLY run this on a mixed batch of benign/malicious samples
+# RMSEs = KitPlugin.kit_runner(120000, 122000, normalize=True)
+# Labels: an array of 1622 zeros (benign) followed by 378 ones (malicious)
+# zeros = np.zeros(1622)
+# ones = np.ones(378)
+# labels = np.concatenate((zeros, ones))
+# KitPlugin.calc_auc_eer(RMSEs, labels)
+
+# KitPlugin = KitPlugin()
+# Random sample of 500000 packets, ordered by timestamp
+# KitPlugin.random_sample_pcap("input_data/Monday-WorkingHours.pcap", "input_data/Monday-WorkingHours_500k.pcap", 500000)
+# KitPlugin.random_sample_pcap("input_data/Monday-WorkingHours.pcap", "input_data/Monday-WorkingHours_1M.pcap", 1000000)
+
+# Out of every 1000 packets, keep only the first 100 (i.e. 10% of all packets), then repeat for the next 1000 packets
+# KitPlugin.interval_sample_pcap("input_data/Monday-WorkingHours.pcap", "input_data/Monday-WorkingHours_10_percent.pcap", 10)
+
+# Sample 10 percent of conversations
+# SampleKitPlugin = KitPlugin()
+# conversations = SampleKitPlugin.sample_percentage_conversations(10, "input_data/Monday_Split/17_01-18_01.pcapng", "input_data/Monday_Split/17_01-18_01-sample-10.pcap")
+# conversations = SampleKitPlugin.conversations_loader('pickles/17_01-18_01_sample_10_conv')
+# packets = rdpcap('input_data/Monday_Split/17_01-18_01-sample-10.pcap')
+# features = SampleKitPlugin.load_pcap_to_features('input_data/Monday_Split/17_01-18_01-sample-10.pcap')
+
+# print('conversations: ' + str(len(conversations)))
+# print('packets: ' + str(len(packets)))
+# print('feature lists: ' + str(len(features)))
+
+# del SampleKitPlugin
+
+# NewKitPlugin = KitPlugin('input_data/Monday_Split/17_01-18_01-sample-10.pcap', num_autenc=6, FMgrace=int(0.05*len(features)), ADgrace=int(0.95*len(features)))
+
+def kitTester(day, attack_type):
+    from KitPlugin import KitPlugin
+    kitplugin = KitPlugin()
+    # One-time preprocessing, kept for reference: sample the attack's packets by
+    # conversation and map them to rows of an existing feature CSV.
+    # labels = kitplugin.read_label_file(f'input_data/attack_types/{day}_{attack_type}.csv')
+    # for label in labels[1:]:
+    #     label.append(str(labels.index(label)))
+    # kitplugin.sample_packets_by_conversation(f'input_data/{day.title()}-WorkingHours.pcap.tsv',
+    #                                          f'input_data/attack_types/{day}_{attack_type}.pcap.tsv', labels)
+    # kitplugin.map_packets_to_features(f'input_data/attack_types/{day}_{attack_type}.pcap.tsv',
+    #                                   f'input_data/attack_types/{day}_features.csv',
+    #                                   f'input_data/attack_types/{day}_features_{attack_type}.csv')
+    results = kitplugin.run_trained_kitsune_from_feature_csv(
+        f"input_data/attack_types/{day}_features_{attack_type}.csv", 0, np.Inf)
+    with open(f'pickles/output_pickles_packet_basis/{day.title()}_{attack_type}_results.pkl', 'wb') as f:
+        pickle.dump(results, f)
+
+    convs = kitplugin.map_results_to_conversation(results, f"input_data/attack_types/{day}_{attack_type}.pcap.tsv")
+    print(f"attack: {attack_type}, convs: {len(convs)}")
+    maxConvs = []
+    for conv in convs:
+        maxConvs.append(np.max(convs[conv]))
+
+    path = f'pickles/output_pickles_conv_basis/{day.title()}_{attack_type}_maxConvs.pkl'
+    with open(path, 'wb') as f:
+        pickle.dump(maxConvs, f)
+
+    return maxConvs
+
+#kitplugin = KitPlugin(input_path="input_data/Monday-WorkingHours.pcap.tsv", packet_limit=np.Inf, num_autenc=50, FMgrace=None, ADgrace=None, learning_rate=0.1, hidden_ratio=0.75)
+#kitplugin.feature_builder("input_data/attack_types/monday_features.csv")
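+
+# Sketch (illustrative): kitTester returns one max-RMSE score per conversation, so a
+# conversation counts as detected when its score exceeds the chosen threshold.
+# The 0.27 threshold below is a placeholder.
+#
+# max_convs = kitTester("tuesday", "FTP-Patator")
+# detected = sum(1 for score in max_convs if score > 0.27)
+# print(f"{detected} of {len(max_convs)} conversations flagged")
+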
+def plot_attack_boxplots(data_for_attack_types, include_outliers=True, log_scale=False):
+    attack_types = list(data_for_attack_types.keys())
+    data = list(data_for_attack_types.values())
+
+    # Keep track of the plotting options used, so they can be shown in the title
+    actions_taken = []
+
+    if log_scale:
+        # Drop non-positive values, which cannot be drawn on a log axis
+        data = [[val for val in sublist if val > 0] for sublist in data]
+        actions_taken.append('log scale')
+
+    plt.figure(figsize=(10, 6))
+
+    if include_outliers:
+        actions_taken.append("outliers included")
+    else:
+        actions_taken.append("no outliers included")
+
+    # Create boxplots
+    bp = plt.boxplot(data, showfliers=include_outliers)
+
+    # Add a sample-count annotation above each boxplot
+    for i, box in enumerate(bp['boxes']):
+        sample_count = len(data[i])
+        conv_count = 0
+        if len(data[i]) > 0:
+            conv_count = max(data[i])
+        plt.text(i + 1, conv_count, f'{sample_count} samples', ha='center', va='bottom')
+
+    plt.xticks(range(1, len(attack_types) + 1), attack_types, rotation=45)
+    plt.xlabel('Attack Types')
+    plt.ylabel('Values')
+
+    # Include the plotting options in the title
+    title = 'Boxplots for Attack Types'
+    if actions_taken:
+        title += f' ({", ".join(actions_taken)})'
+    plt.title(title)
+
+    # Apply the log scale last, after all artists have been added
+    if log_scale:
+        plt.yscale('log')
+
+    plt.tight_layout()
+    plt.show()
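+
+# Usage sketch (illustrative): plot the per-conversation scores of a few attack types
+# side by side. The day/attack names follow the commented runs below.
+#
+# attack_dict = {"FTP-Patator": kitTester("tuesday", "FTP-Patator"),
+#                "SSH-Patator": kitTester("tuesday", "SSH-Patator")}
+# plot_attack_boxplots(attack_dict, include_outliers=False, log_scale=True)
+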
+def create_attack_barchart_excel(data_for_attack_types):
+    attack_types = list(data_for_attack_types.keys())
+    data = list(data_for_attack_types.values())
+
+    # Create a workbook and add a worksheet
+    wb = Workbook()
+    ws = wb.active
+    ws.title = "Attack Bar Chart"
+
+    # Write attack types to the first column
+    for row, attack_type in enumerate(attack_types, start=1):
+        ws.cell(row=row, column=1, value=attack_type)
+
+    # Write each attack's values to its own column, starting at column 2
+    for col, attack_data in enumerate(data, start=2):
+        for row, value in enumerate(attack_data, start=1):
+            ws.cell(row=row, column=col, value=value)
+
+    # Create a bar chart
+    chart = BarChart()
+    chart.title = "Bar Chart for Attack Types"
+    chart.x_axis.title = 'Attack Types'
+    chart.y_axis.title = 'Values'
+
+    # Set data for the chart: one column per attack type, as many rows as the longest series
+    max_len = max((len(attack_data) for attack_data in data), default=1)
+    values = Reference(ws, min_col=2, min_row=1, max_col=len(attack_types) + 1, max_row=max_len)
+    categories = Reference(ws, min_col=1, min_row=1, max_row=len(attack_types))
+    chart.add_data(values, titles_from_data=True)
+    chart.set_categories(categories)
+
+    # Anchor the chart and save the workbook
+    ws.add_chart(chart, "E5")
+    wb.save("attack_barchart.xlsx")
+
+# attacks1 = ["Infiltration - Portscan"]
+# convs = []
+# for attack in attacks1:
+#     print(attack)
+#     convs.append(kitTester("thursday", attack))
+#
+# attacks2 = ["benign - small", "FTP-Patator", "FTP-Patator - Attempted", "SSH-Patator", "SSH-Patator - Attempted"]
+# for attack in attacks2:
+#     print(attack)
+#     convs.append(kitTester("tuesday", attack))
+
+# attacks = attacks1 + attacks2
+# attack_dict = {attack: conv for attack, conv in zip(attacks, convs)}
+# plot_attack_boxplots(attack_dict, include_outliers=False, log_scale=False)
+# plot_attack_boxplots(attack_dict, include_outliers=False, log_scale=True)
+# plot_attack_boxplots(attack_dict, include_outliers=True, log_scale=False)
+# plot_attack_boxplots(attack_dict, include_outliers=True, log_scale=True)
+# create_attack_barchart_excel(attack_dict)
+
+#kitplugin = KitPlugin(input_path="input_data/Monday-WorkingHours.pcap", packet_limit=np.Inf, num_autenc=50, FMgrace=None, ADgrace=None, learning_rate=0.1, hidden_ratio=0.75)
+#kitplugin.feature_builder("input_data/attack_types/monday_features_test.csv", True)
+
+kitplugin = KitPlugin()
+#kitplugin.most_significant_packets_sampler("thursday", 0.2667368034640465)
+results = kitplugin.shap_documenter("thursday")
+# kitplugin.most_significant_packets_sampler("tuesday", 0.2667368034640465)
+#results = kitplugin.shap_documenter("wednesday")
+
+#kitplugin.hyper_opt_KitNET("monday_features.csv")
diff --git a/hyperopt.db b/hyperopt.db
new file mode 100644
index 0000000..2452e47
Binary files /dev/null and b/hyperopt.db differ
diff --git a/netStat.py b/netStat.py
index a61a257..aa33d7b 100644
--- a/netStat.py
+++ b/netStat.py
@@ -70,12 +70,20 @@ def findDirection(self,IPtype,srcIP,dstIP,eth_src,eth_dst): #cpp: this is all gi
 
         return src_subnet, dst_subnet
 
-    def updateGetStats(self, IPtype, srcMAC,dstMAC, srcIP, srcProtocol, dstIP, dstProtocol, datagramSize, timestamp):
+    def updateGetStats(self, IPtype, srcMAC,dstMAC, srcIP, srcProtocol, dstIP, dstProtocol, datagramSize, timestamp, tcpFlags=False, payload=0):
         # Host BW: Stats on the srcIP's general Sender Statistics
         # Hstat = np.zeros((3*len(self.Lambdas,)))
         # for i in range(len(self.Lambdas)):
         #     Hstat[(i*3):((i+1)*3)] = self.HT_H.update_get_1D_Stats(srcIP, timestamp, datagramSize, self.Lambdas[i])
 
+        # In tcp-flags mode, a packet without a flag string still yields a vector of the right width
+        if tcpFlags == "":
+            return np.zeros(8 * len(self.Lambdas))
+        if tcpFlags:
+            # MAC.IP: Stats on src MAC-IP relationships (8 flag ratios per lambda)
+            MIstat = np.zeros((8 * len(self.Lambdas, )))
+            for i in range(len(self.Lambdas)):
+                MIstat[(i*8):((i+1)*8)] = self.HT_MI.update_get_1D_Stats(srcMAC + srcIP, timestamp, datagramSize, self.Lambdas[i], tcpFlags=tcpFlags)
+            return MIstat
         #MAC.IP: Stats on src MAC-IP relationships
         MIstat = np.zeros((3*len(self.Lambdas,)))
         for i in range(len(self.Lambdas)):
diff --git a/summary_statistics_test.xlsx b/summary_statistics_test.xlsx
new file mode 100644
index 0000000..3a0e756
Binary files /dev/null and b/summary_statistics_test.xlsx differ
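In tcp-flags mode, updateGetStats returns 8 values per decay factor (lambda), one per TCP flag. A minimal sketch of the intended vector layout, assuming three lambdas and the standard FIN-through-CWR bit order of tcp.flags; the lambda values here are illustrative only:

    import numpy as np

    lambdas = [5, 3, 1]  # illustrative decay factors
    flags = ["FIN", "SYN", "RST", "PSH", "ACK", "URG", "ECE", "CWR"]
    vec = np.zeros(8 * len(lambdas))  # same width as the early return above
    # vec[i*8:(i+1)*8] holds the eight per-flag ratios for lambdas[i]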