CYBER ATTACKS DATASET CREATION AND ANALYSIS USING PROC SQL | PROC MEANS | PROC FREQ | PROC SGPLOT WITH MACRO-BASED RISK CLASSIFICATION AND DATE FUNCTIONS (INTCK | INTNX)
options nocenter;
1) CREATE RAW DATASET: work.cyber_attacks_raw
data work.cyber_attacks_raw;
infile datalines dlm='|' dsd;
informat Occurrence_Date date9. Discovery_Date date9.;
format Occurrence_Date date9. Discovery_Date date9.;
length Attack_Type $30 Country $30;
input Attack_Type $ Country $ Frequency Financial_Loss Occurrence_Date Discovery_Date
Vulnerability_Level;
/* compute detection time in days using INTCK (difference in days) */
Detection_Time_Days = intck('day', Occurrence_Date, Discovery_Date);
datalines;
Phishing|USA|120|1500|01JAN2025|03JAN2025|6
Ransomware|India|18|7500|15FEB2025|20FEB2025|9
DDoS|China|45|250|05MAR2025|05MAR2025|3
Insider Threat|Germany|6|300|12MAR2025|25MAR2025|8
Supply Chain|USA|2|12000|28FEB2025|12MAR2025|7
SQL Injection|Brazil|30|400|10JAN2025|15JAN2025|5
Zero-day Exploit|Russia|1|9000|22MAR2025|28MAR2025|9
Credential Stuffing|UK|85|600|07FEB2025|10FEB2025|4
Malware (Trojan)|South_Africa|25|1200|02MAR2025|06MAR2025|6
Business Email Compromise|Australia|9|2200|18JAN2025|30JAN2025|12
Man-in-the-Middle|Canada|12|150|09FEB2025|11FEB2025|2
Adware/Spyware|Mexico|40|80|20JAN2025|22JAN2025|3
;
run;
proc print data=work.cyber_attacks_raw label noobs;
title "CYBER ATTACKS RAW (12 OBS)";
var Attack_Type Country Frequency Financial_Loss Occurrence_Date Discovery_Date
Detection_Time_Days Vulnerability_Level;
run;
OUTPUT:
| Attack_Type | Country | Frequency | Financial_Loss | Occurrence_Date | Discovery_Date | Detection_Time_Days | Vulnerability_Level |
|---|---|---|---|---|---|---|---|
| Phishing | USA | 120 | 1500 | 01JAN2025 | 03JAN2025 | 2 | 6 |
| Ransomware | India | 18 | 7500 | 15FEB2025 | 20FEB2025 | 5 | 9 |
| DDoS | China | 45 | 250 | 05MAR2025 | 05MAR2025 | 0 | 3 |
| Insider Threat | Germany | 6 | 300 | 12MAR2025 | 25MAR2025 | 13 | 8 |
| Supply Chain | USA | 2 | 12000 | 28FEB2025 | 12MAR2025 | 12 | 7 |
| SQL Injection | Brazil | 30 | 400 | 10JAN2025 | 15JAN2025 | 5 | 5 |
| Zero-day Exploit | Russia | 1 | 9000 | 22MAR2025 | 28MAR2025 | 6 | 9 |
| Credential Stuffing | UK | 85 | 600 | 07FEB2025 | 10FEB2025 | 3 | 4 |
| Malware (Trojan) | South_Africa | 25 | 1200 | 02MAR2025 | 06MAR2025 | 4 | 6 |
| Business Email Compromise | Australia | 9 | 2200 | 18JAN2025 | 30JAN2025 | 12 | 12 |
| Man-in-the-Middle | Canada | 12 | 150 | 09FEB2025 | 11FEB2025 | 2 | 2 |
| Adware/Spyware | Mexico | 40 | 80 | 20JAN2025 | 22JAN2025 | 2 | 3 |
2) Use PROC SQL for derived tables / aggregations
/* Create a summary with average loss and mean detection time by Attack_Type */
proc sql;
create table work.attack_summary as
select Attack_Type,
count(*) as N,
mean(Frequency) format=8.2 as Mean_Freq,
mean(Financial_Loss) format=12.2 as Mean_Loss_KUSD,
mean(Detection_Time_Days) format=8.2 as Mean_Detect_Days,
mean(Vulnerability_Level) format=8.2 as Mean_Vuln
from work.cyber_attacks_raw
group by Attack_Type;
quit;
proc print data=work.attack_summary;
run;
OUTPUT:
| Obs | Attack_Type | N | Mean_Freq | Mean_Loss_KUSD | Mean_Detect_Days | Mean_Vuln |
|---|---|---|---|---|---|---|
| 1 | Adware/Spyware | 1 | 40.00 | 80.00 | 2.00 | 3.00 |
| 2 | Business Email Compromise | 1 | 9.00 | 2200.00 | 12.00 | 12.00 |
| 3 | Credential Stuffing | 1 | 85.00 | 600.00 | 3.00 | 4.00 |
| 4 | DDoS | 1 | 45.00 | 250.00 | 0.00 | 3.00 |
| 5 | Insider Threat | 1 | 6.00 | 300.00 | 13.00 | 8.00 |
| 6 | Malware (Trojan) | 1 | 25.00 | 1200.00 | 4.00 | 6.00 |
| 7 | Man-in-the-Middle | 1 | 12.00 | 150.00 | 2.00 | 2.00 |
| 8 | Phishing | 1 | 120.00 | 1500.00 | 2.00 | 6.00 |
| 9 | Ransomware | 1 | 18.00 | 7500.00 | 5.00 | 9.00 |
| 10 | SQL Injection | 1 | 30.00 | 400.00 | 5.00 | 5.00 |
| 11 | Supply Chain | 1 | 2.00 | 12000.00 | 12.00 | 7.00 |
| 12 | Zero-day Exploit | 1 | 1.00 | 9000.00 | 6.00 | 9.00 |
/* Top 5 highest financial loss events */
proc sql;
create table work.top_losses as
select * from work.cyber_attacks_raw
order by Financial_Loss desc
;
quit;
proc print data=work.top_losses;
run;
OUTPUT:
| Obs | Occurrence_Date | Discovery_Date | Attack_Type | Country | Frequency | Financial_Loss | Vulnerability_Level | Detection_Time_Days |
|---|---|---|---|---|---|---|---|---|
| 1 | 28FEB2025 | 12MAR2025 | Supply Chain | USA | 2 | 12000 | 7 | 12 |
| 2 | 22MAR2025 | 28MAR2025 | Zero-day Exploit | Russia | 1 | 9000 | 9 | 6 |
| 3 | 15FEB2025 | 20FEB2025 | Ransomware | India | 18 | 7500 | 9 | 5 |
| 4 | 18JAN2025 | 30JAN2025 | Business Email Compromise | Australia | 9 | 2200 | 12 | 12 |
| 5 | 01JAN2025 | 03JAN2025 | Phishing | USA | 120 | 1500 | 6 | 2 |
| 6 | 02MAR2025 | 06MAR2025 | Malware (Trojan) | South_Africa | 25 | 1200 | 6 | 4 |
| 7 | 07FEB2025 | 10FEB2025 | Credential Stuffing | UK | 85 | 600 | 4 | 3 |
| 8 | 10JAN2025 | 15JAN2025 | SQL Injection | Brazil | 30 | 400 | 5 | 5 |
| 9 | 12MAR2025 | 25MAR2025 | Insider Threat | Germany | 6 | 300 | 8 | 13 |
| 10 | 05MAR2025 | 05MAR2025 | DDoS | China | 45 | 250 | 3 | 0 |
| 11 | 09FEB2025 | 11FEB2025 | Man-in-the-Middle | Canada | 12 | 150 | 2 | 2 |
| 12 | 20JAN2025 | 22JAN2025 | Adware/Spyware | Mexico | 40 | 80 | 3 | 2 |
3) Macro: Risk classification (composite score)
%macro classify_risk(indata=work.cyber_attacks_raw, outdata=work.cyber_attacks_risk);
data &outdata.;
set &indata.;
if _n_=1 then do;
call symputx('minFreq', .);
call symputx('maxFreq', .);
call symputx('minLoss', .);
call symputx('maxLoss', .);
call symputx('minDetect', .);
call symputx('maxDetect', .);
call symputx('minVuln', .);
call symputx('maxVuln', .);
end;
n_Freq = min(max((Frequency - 0) / (200 - 0) * 100, 0), 100);
n_Loss = min(max((Financial_Loss - 0) / (20000 - 0) * 100, 0), 100);
/* For detection time: longer detection time increases risk => higher score */
n_Detect = min(max((Detection_Time_Days - 0) / (365 - 0) * 100, 0), 100);
/* Vulnerability level already 1-10, scale to 0-100 */
n_Vuln = min(max((Vulnerability_Level - 1) / (10 - 1) * 100, 0), 100);
/* Weighted composite score (weights sum to 1) */
/* Weights: Loss 0.35, Vulnerability 0.25, Frequency 0.20, Detection 0.20 */
Risk_Score = (0.20 * n_Freq) + (0.35 * n_Loss) + (0.20 * n_Detect) + (0.25 * n_Vuln);
format Risk_Score 8.2;
/* Risk categorization */
if Risk_Score < 25 then Risk_Cat = 'Low';
else if 25 <= Risk_Score < 50 then Risk_Cat = 'Medium';
else if 50 <= Risk_Score < 75 then Risk_Cat = 'High';
else Risk_Cat = 'Critical';
/* Demonstrate INTCK / INTNX usage: compute months between occurrence and today,
and month-start of discovery */
Months_Since_Occurrence = intck('month', Occurrence_Date, today());
Discovery_MonthStart = intnx('month', Discovery_Date, 0, 'beginning');
format Discovery_MonthStart date9.;
run;
%mend classify_risk;
%classify_risk(indata=work.cyber_attacks_raw, outdata=work.cyber_attacks_risk);
proc print data=work.cyber_attacks_risk noobs;
title "CYBER ATTACKS WITH RISK SCORES";
var Attack_Type Country Frequency Financial_Loss Occurrence_Date Discovery_Date
Detection_Time_Days Vulnerability_Level Risk_Score Risk_Cat Months_Since_Occurrence
Discovery_MonthStart;
run;
OUTPUT:
| Attack_Type | Country | Frequency | Financial_Loss | Occurrence_Date | Discovery_Date | Detection_Time_Days | Vulnerability_Level | Risk_Score | Risk_Cat | Months_Since_Occurrence | Discovery_MonthStart |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Phishing | USA | 120 | 1500 | 01JAN2025 | 03JAN2025 | 2 | 6 | 28.62 | Med | 11 | 01JAN2025 |
| Ransomware | India | 18 | 7500 | 15FEB2025 | 20FEB2025 | 5 | 9 | 37.42 | Med | 10 | 01FEB2025 |
| DDoS | China | 45 | 250 | 05MAR2025 | 05MAR2025 | 0 | 3 | 10.49 | Low | 9 | 01MAR2025 |
| Insider Threat | Germany | 6 | 300 | 12MAR2025 | 25MAR2025 | 13 | 8 | 21.28 | Low | 9 | 01MAR2025 |
| Supply Chain | USA | 2 | 12000 | 28FEB2025 | 12MAR2025 | 12 | 7 | 38.52 | Med | 10 | 01MAR2025 |
| SQL Injection | Brazil | 30 | 400 | 10JAN2025 | 15JAN2025 | 5 | 5 | 15.09 | Low | 11 | 01JAN2025 |
| Zero-day Exploit | Russia | 1 | 9000 | 22MAR2025 | 28MAR2025 | 6 | 9 | 38.40 | Med | 9 | 01MAR2025 |
| Credential Stuffing | UK | 85 | 600 | 07FEB2025 | 10FEB2025 | 3 | 4 | 18.05 | Low | 10 | 01FEB2025 |
| Malware (Trojan) | South_Africa | 25 | 1200 | 02MAR2025 | 06MAR2025 | 4 | 6 | 18.71 | Low | 9 | 01MAR2025 |
| Business Email Compromise | Australia | 9 | 2200 | 18JAN2025 | 30JAN2025 | 12 | 12 | 30.41 | Med | 11 | 01JAN2025 |
| Man-in-the-Middle | Canada | 12 | 150 | 09FEB2025 | 11FEB2025 | 2 | 2 | 4.35 | Low | 10 | 01FEB2025 |
| Adware/Spyware | Mexico | 40 | 80 | 20JAN2025 | 22JAN2025 | 2 | 3 | 9.81 | Low | 11 | 01JAN2025 |
4) PROC MEANS for numeric summaries
proc means data=work.cyber_attacks_risk n mean median std min max;
var Frequency Financial_Loss Detection_Time_Days Vulnerability_Level Risk_Score
Months_Since_Occurrence;
title "NUMERIC SUMMARY - MEANS";
run;
OUTPUT:
The MEANS Procedure
| Variable | N | Mean | Median | Std Dev | Minimum | Maximum |
|---|---|---|---|---|---|---|
Frequency Financial_Loss Detection_Time_Days Vulnerability_Level Risk_Score Months_Since_Occurrence | 12 12 12 12 12 12 | 32.7500000 2931.67 5.5000000 6.1666667 22.5956754 10.0000000 | 21.5000000 900.0000000 4.5000000 6.0000000 19.9949201 10.0000000 | 36.2995743 4126.91 4.4415395 2.9797295 11.8995600 0.8528029 | 1.0000000 80.0000000 0 2.0000000 4.3498668 9.0000000 | 120.0000000 12000.00 13.0000000 12.0000000 38.5242009 11.0000000 |
5) PROC FREQ for categorical summaries
proc freq data=work.cyber_attacks_risk;
tables Risk_Cat Attack_Type*Risk_Cat / norow nocol nopercent;
title "FREQUENCY OF RISK CATEGORIES";
run;
OUTPUT:
The FREQ Procedure
| Risk_Cat | Frequency | Cumulative Frequency |
|---|---|---|
| Low | 7 | 7 |
| Med | 5 | 12 |
|
| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
6) PROC SGPLOT examples:
ods graphics / reset imagename='RiskBar' width=800 height=400;
proc sgplot data=work.cyber_attacks_risk;
vbar Risk_Cat / datalabel;
xaxis label="Risk Category";
yaxis label="Count";
title "COUNT OF EVENTS BY RISK CATEGORY";
run;
OUTPUT:
ods graphics / reset imagename='LossByAttack' width=900 height=450;
proc sgplot data=work.cyber_attacks_risk;
hbar Attack_Type / response=Financial_Loss stat=sum datalabel;
xaxis label="Total Financial Loss (K USD)";
title "TOTAL FINANCIAL LOSS BY ATTACK TYPE";
run;
OUTPUT:
ods graphics / reset imagename='RiskVsDetect' width=800 height=400;
proc sgplot data=work.cyber_attacks_risk;
scatter x=Detection_Time_Days y=Risk_Score / datalabel=Attack_Type markerattrs=(symbol=CircleFilled);
reg x=Detection_Time_Days y=Risk_Score;
xaxis label="Detection Time (days)";
yaxis label="Risk Score";
title "RISK SCORE VS DETECTION TIME";
run;
OUTPUT:
7) Examples of additional derived datasets
proc sql;
create table work.monthly_by_attack as
select intnx('month', Occurrence_Date, 0, 'beginning') as Month_Start format=date9.,
Attack_Type,
sum(Financial_Loss) as Total_Loss_KUSD,
sum(Frequency) as Total_Freq,
mean(Risk_Score) as Avg_Risk_Score
from work.cyber_attacks_risk
group by calculated Month_Start, Attack_Type
order by calculated Month_Start, Attack_Type;
quit;
proc print data=work.monthly_by_attack noobs;
title "MONTHLY AGGREGATION BY ATTACK TYPE";
run;
| Month_Start | Attack_Type | Total_Loss_KUSD | Total_Freq | Avg_Risk_Score |
|---|---|---|---|---|
| 01JAN2025 | Adware/Spyware | 80 | 40 | 9.8051 |
| 01JAN2025 | Business Email Compromise | 2200 | 9 | 30.4075 |
| 01JAN2025 | Phishing | 1500 | 120 | 28.6235 |
| 01JAN2025 | SQL Injection | 400 | 30 | 15.0851 |
| 01FEB2025 | Credential Stuffing | 600 | 85 | 18.0477 |
| 01FEB2025 | Man-in-the-Middle | 150 | 12 | 4.3499 |
| 01FEB2025 | Ransomware | 7500 | 18 | 37.4212 |
| 01FEB2025 | Supply Chain | 12000 | 2 | 38.5242 |
| 01MAR2025 | DDoS | 250 | 45 | 10.4931 |
| 01MAR2025 | Insider Threat | 300 | 6 | 21.2818 |
| 01MAR2025 | Malware (Trojan) | 1200 | 25 | 18.7081 |
| 01MAR2025 | Zero-day Exploit | 9000 | 1 | 38.4010 |
No comments:
Post a Comment