STARTUP DATASET CREATION AND ANALYTICAL WORKFLOW USING PROC SQL | PROC MEANS | PROC CORR | PROC SGPLOT | MACROS WITH INTCK AND INTNX DATE FUNCTIONS
options nocenter;
1) Create startup table using PROC SQL with SAS date literals for Founded and Last Funding */
proc sql;
create table work.startups
(
Startup_Name char(40),
Industry char(30),
Funding num format=dollar12.,
Employees num,
Growth_Rate num format=8.2,
Country char(30),
Founded_Date num format=ddmmyy10.,
Last_Funding_Date num format=ddmmyy10.
);
quit;
LOG:
2) Inserting the company names,employees and more
proc sql;
insert into work.startups
select
"AeroScale Innovations", "Aerospace", 5500000, 45, 24.5, "United States",
input("15JAN2018", date9.) , input("10JUN2023", date9.)
from sashelp.class(obs=1)
union all select
"GreenLoop Energy", "Clean Energy", 3200000, 28, 18.0, "India",
input("02MAR2019", date9.) , input("01DEC2022", date9.)
from sashelp.class(obs=1)
union all select
"MediSoft Labs", "Healthcare IT", 12000000, 120, 32.1, "Canada",
input("20JUL2017", date9.) , input("15FEB2024", date9.)
from sashelp.class(obs=1)
union all select
"FinSight Analytics", "Fintech", 7500000, 60, 21.4, "United Kingdom",
input("05MAY2016", date9.) , input("20SEP2023", date9.)
from sashelp.class(obs=1)
union all select
"EduNext Platform", "EdTech", 1800000, 15, 12.8, "India",
input("19NOV2020", date9.) , input("11NOV2023", date9.)
from sashelp.class(obs=1)
union all select
"AgriSense Solutions", "AgriTech", 1400000, 10, 9.5, "Brazil",
input("01JAN2021", date9.) , input("01MAR2022", date9.)
from sashelp.class(obs=1)
union all select
"BlockWave Systems", "Blockchain", 4600000, 35, 27.0, "Singapore",
input("10OCT2018", date9.) , input("20AUG2023", date9.)
from sashelp.class(obs=1)
union all select
"RetailRover", "Ecommerce", 2500000, 22, 14.3, "United States",
input("23JUN2019", date9.) , input("05JUL2023", date9.)
from sashelp.class(obs=1)
union all select
"NanoHealth Bio", "Biotech", 25000000, 200, 38.9, "Germany",
input("12DEC2015", date9.) , input("28FEB2024", date9.)
from sashelp.class(obs=1)
union all select
"UrbanMobility Labs", "Mobility", 9000000, 80, 19.7, "Netherlands",
input("30SEP2016", date9.) , input("10JAN2024", date9.)
from sashelp.class(obs=1)
union all select
"CyberGuard AI", "Cybersecurity", 6100000, 52, 23.2, "Israel",
input("07APR2017", date9.) , input("02NOV2023", date9.)
from sashelp.class(obs=1)
union all select
"FoodLoop Delivery", "Logistics", 2100000, 18, 11.6, "India",
input("14AUG2020", date9.) , input("15AUG2022", date9.)
from sashelp.class(obs=1);
quit;
proc print data=work.startups label noobs;
var Startup_Name Industry Funding Employees Growth_Rate Country Founded_Date Last_Funding_Date;
format Funding dollar12. Founded_Date ddmmyy10. Last_Funding_Date ddmmyy10.;
title "Initial Startup Table (12 Observations)";
run;
OUTPUT:
| Startup_Name | Industry | Funding | Employees | Growth_Rate | Country | Founded_Date | Last_Funding_Date |
|---|---|---|---|---|---|---|---|
| AeroScale Innovations | Aerospace | $5,500,000 | 45 | 24.50 | United States | 15/01/2018 | 10/06/2023 |
| GreenLoop Energy | Clean Energy | $3,200,000 | 28 | 18.00 | India | 02/03/2019 | 01/12/2022 |
| MediSoft Labs | Healthcare IT | $12,000,000 | 120 | 32.10 | Canada | 20/07/2017 | 15/02/2024 |
| FinSight Analytics | Fintech | $7,500,000 | 60 | 21.40 | United Kingdom | 05/05/2016 | 20/09/2023 |
| EduNext Platform | EdTech | $1,800,000 | 15 | 12.80 | India | 19/11/2020 | 11/11/2023 |
| AgriSense Solutions | AgriTech | $1,400,000 | 10 | 9.50 | Brazil | 01/01/2021 | 01/03/2022 |
| BlockWave Systems | Blockchain | $4,600,000 | 35 | 27.00 | Singapore | 10/10/2018 | 20/08/2023 |
| RetailRover | Ecommerce | $2,500,000 | 22 | 14.30 | United States | 23/06/2019 | 05/07/2023 |
| NanoHealth Bio | Biotech | $25,000,000 | 200 | 38.90 | Germany | 12/12/2015 | 28/02/2024 |
| UrbanMobility Labs | Mobility | $9,000,000 | 80 | 19.70 | Netherlands | 30/09/2016 | 10/01/2024 |
| CyberGuard AI | Cybersecurity | $6,100,000 | 52 | 23.20 | Israel | 07/04/2017 | 02/11/2023 |
| FoodLoop Delivery | Logistics | $2,100,000 | 18 | 11.60 | India | 14/08/2020 | 15/08/2022 |
3) Compute numeric summaries using PROC MEANS
title "Summary Statistics (Funding, Employees, Growth_Rate)";
proc means data=work.startups n mean median std min max maxdec=2;
var Funding Employees Growth_Rate;
run;
OUTPUT:
The MEANS Procedure
| Variable | N | Mean | Median | Std Dev | Minimum | Maximum |
|---|---|---|---|---|---|---|
Funding Employees Growth_Rate | 12 12 12 | 6725000.00 57.08 21.08 | 5050000.00 40.00 20.55 | 6593815.97 54.91 8.74 | 1400000.00 10.00 9.50 | 25000000.00 200.00 38.90 |
4) Compute tertiles (33rd and 66th percentiles) for Growth_Rate using PROC UNIVARIATE
proc univariate data=work.startups noprint;
var Growth_Rate;
output out=work.growth_pcts pctlpts=33 66 pctlpre=P_;
run;
proc print data=work.growth_pcts;
run;
OUTPUT:
| Obs | P_33 | P_66 |
|---|---|---|
| 1 | 14.3 | 23.2 |
5) Macro to calculate company age and next funding anniversary using INTCK and INTNX
%macro compute_dates(in=work.startups, out=work.startups_dates);
data &out;
set ∈
/* Age in completed years */
Company_Age_Years = intck('year', Founded_Date, today());
/* Age with fractional years (approx) using days */
Company_Age_Days = intck('day', Founded_Date, today());
Company_Age_Years_frac = Company_Age_Days / 365.25;
/* Next funding anniversary after Last_Funding_Date: align to next year boundary */
Next_Funding_Anniversary = intnx('year', Last_Funding_Date, 1, 'same');
/* Compute days until next funding anniversary */
Days_to_Next_Anniv = intck('day', today(), Next_Funding_Anniversary);
format Founded_Date ddmmyy10. Last_Funding_Date ddmmyy10. Next_Funding_Anniversary ddmmyy10.;
run;
/* Quick sanity checks */
proc print data=&out (obs=12);
var Startup_Name Founded_Date Company_Age_Years Company_Age_Years_frac Last_Funding_Date Next_Funding_Anniversary Days_to_Next_Anniv;
title "Company Age and Next Funding Anniversary";
run;
%mend compute_dates;
%compute_dates(in=work.startups, out=work.startups_dates);
OUTPUT:
| Obs | Startup_Name | Founded_Date | Company_Age_Years | Company_Age_Years_frac | Last_Funding_Date | Next_Funding_Anniversary | Days_to_Next_Anniv |
|---|---|---|---|---|---|---|---|
| 1 | AeroScale Innovations | 15/01/2018 | 7 | 7.90144 | 10/06/2023 | 10/06/2024 | -548 |
| 2 | GreenLoop Energy | 02/03/2019 | 6 | 6.77618 | 01/12/2022 | 01/12/2023 | -740 |
| 3 | MediSoft Labs | 20/07/2017 | 8 | 8.39151 | 15/02/2024 | 15/02/2025 | -298 |
| 4 | FinSight Analytics | 05/05/2016 | 9 | 9.59890 | 20/09/2023 | 20/09/2024 | -446 |
| 5 | EduNext Platform | 19/11/2020 | 5 | 5.05681 | 11/11/2023 | 11/11/2024 | -394 |
| 6 | AgriSense Solutions | 01/01/2021 | 4 | 4.93908 | 01/03/2022 | 01/03/2023 | -1015 |
| 7 | BlockWave Systems | 10/10/2018 | 7 | 7.16769 | 20/08/2023 | 20/08/2024 | -477 |
| 8 | RetailRover | 23/06/2019 | 6 | 6.46680 | 05/07/2023 | 05/07/2024 | -523 |
| 9 | NanoHealth Bio | 12/12/2015 | 10 | 9.99589 | 28/02/2024 | 28/02/2025 | -285 |
| 10 | UrbanMobility Labs | 30/09/2016 | 9 | 9.19370 | 10/01/2024 | 10/01/2025 | -334 |
| 11 | CyberGuard AI | 07/04/2017 | 8 | 8.67625 | 02/11/2023 | 02/11/2024 | -403 |
| 12 | FoodLoop Delivery | 14/08/2020 | 5 | 5.32238 | 15/08/2022 | 15/08/2023 | -848 |
6) Macro to classify Growth_Rate into tertiles (Low/Medium/High) using the percentiles computed earlier
%macro classify_growth(in=work.startups_dates, out=work.startups_cat);
/* read percentiles into macro variables */
proc sql noprint;
select P_33, P_66 into :p33 trimmed, :p66 trimmed from work.growth_pcts;
quit;
data &out;
set ∈
length Growth_Category $7;
if missing(Growth_Rate) then Growth_Category = 'Unknown';
else if Growth_Rate <= &p33 then Growth_Category = 'Low';
else if Growth_Rate <= &p66 then Growth_Category = 'Medium';
else Growth_Category = 'High';
run;
proc print data=&out;
run;
proc freq data=&out;
tables Growth_Category / nocum;
title "Growth Category Distribution (tertiles: &p33 and &p66)";
run;
%mend classify_growth;
%classify_growth(in=work.startups_dates, out=work.startups_cat);
OUTPUT:
| Obs | Startup_Name | Industry | Funding | Employees | Growth_Rate | Country | Founded_Date | Last_Funding_Date | Company_Age_Years | Company_Age_Days | Company_Age_Years_frac | Next_Funding_Anniversary | Days_to_Next_Anniv | Growth_Category |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | AeroScale Innovations | Aerospace | $5,500,000 | 45 | 24.50 | United States | 15/01/2018 | 10/06/2023 | 7 | 2886 | 7.90144 | 10/06/2024 | -548 | High |
| 2 | GreenLoop Energy | Clean Energy | $3,200,000 | 28 | 18.00 | India | 02/03/2019 | 01/12/2022 | 6 | 2475 | 6.77618 | 01/12/2023 | -740 | Medium |
| 3 | MediSoft Labs | Healthcare IT | $12,000,000 | 120 | 32.10 | Canada | 20/07/2017 | 15/02/2024 | 8 | 3065 | 8.39151 | 15/02/2025 | -298 | High |
| 4 | FinSight Analytics | Fintech | $7,500,000 | 60 | 21.40 | United Kingdom | 05/05/2016 | 20/09/2023 | 9 | 3506 | 9.59890 | 20/09/2024 | -446 | Medium |
| 5 | EduNext Platform | EdTech | $1,800,000 | 15 | 12.80 | India | 19/11/2020 | 11/11/2023 | 5 | 1847 | 5.05681 | 11/11/2024 | -394 | Low |
| 6 | AgriSense Solutions | AgriTech | $1,400,000 | 10 | 9.50 | Brazil | 01/01/2021 | 01/03/2022 | 4 | 1804 | 4.93908 | 01/03/2023 | -1015 | Low |
| 7 | BlockWave Systems | Blockchain | $4,600,000 | 35 | 27.00 | Singapore | 10/10/2018 | 20/08/2023 | 7 | 2618 | 7.16769 | 20/08/2024 | -477 | High |
| 8 | RetailRover | Ecommerce | $2,500,000 | 22 | 14.30 | United States | 23/06/2019 | 05/07/2023 | 6 | 2362 | 6.46680 | 05/07/2024 | -523 | Low |
| 9 | NanoHealth Bio | Biotech | $25,000,000 | 200 | 38.90 | Germany | 12/12/2015 | 28/02/2024 | 10 | 3651 | 9.99589 | 28/02/2025 | -285 | High |
| 10 | UrbanMobility Labs | Mobility | $9,000,000 | 80 | 19.70 | Netherlands | 30/09/2016 | 10/01/2024 | 9 | 3358 | 9.19370 | 10/01/2025 | -334 | Medium |
| 11 | CyberGuard AI | Cybersecurity | $6,100,000 | 52 | 23.20 | Israel | 07/04/2017 | 02/11/2023 | 8 | 3169 | 8.67625 | 02/11/2024 | -403 | Medium |
| 12 | FoodLoop Delivery | Logistics | $2,100,000 | 18 | 11.60 | India | 14/08/2020 | 15/08/2022 | 5 | 1944 | 5.32238 | 15/08/2023 | -848 | Low |
The FREQ Procedure
| Growth_Category | Frequency | Percent |
|---|---|---|
| High | 4 | 33.33 |
| Low | 4 | 33.33 |
| Medium | 4 | 33.33 |
7) Correlation analysis between Funding, Employees, Growth_Rate, and Company_Age_Years
title "Pearson Correlations";
proc corr data=work.startups_cat nosimple pearson;
var Funding Employees Growth_Rate Company_Age_Years;
run;
OUTPUT:
The CORR Procedure
| 4 Variables: | Funding Employees Growth_Rate Company_Age_Years |
|---|
| Pearson Correlation Coefficients, N = 12 Prob > |r| under H0: Rho=0 | ||||
|---|---|---|---|---|
| Funding | Employees | Growth_Rate | Company_Age_Years | |
| Funding | 1.00000 | 0.99236 <.0001 | 0.86541 0.0003 | 0.80409 0.0016 |
| Employees | 0.99236 <.0001 | 1.00000 | 0.87481 0.0002 | 0.80699 0.0015 |
| Growth_Rate | 0.86541 0.0003 | 0.87481 0.0002 | 1.00000 | 0.80925 0.0014 |
| Company_Age_Years | 0.80409 0.0016 | 0.80699 0.0015 | 0.80925 0.0014 | 1.00000 |
8) Visualizations: scatter, vbar and boxplot using PROC SGPLOT
title "Scatter: Funding vs Growth_Rate with Regression";
proc sgplot data=work.startups_cat;
scatter x=Funding y=Growth_Rate / markerattrs=(symbol=CircleFilled) datalabel=Startup_Name;
reg x=Funding y=Growth_Rate / nomarkers lineattrs=(pattern=1);
xaxis label="Funding (USD)";
yaxis label="Growth Rate (%)";
run;
OUTPUT:
title "Counts by Growth Category";
proc sgplot data=work.startups_cat;
vbar Growth_Category / datalabel;
xaxis label="Growth Category";
yaxis label="Count";
run;
OUTPUT:
title "Growth Rate by Industry (Boxplot)";
proc sgplot data=work.startups_cat;
vbox Growth_Rate / category=Industry;
yaxis label="Growth Rate (%)";
run;
OUTPUT:
9) Utility macro to list top N by a chosen metric
%macro top_n(data=work.startups_cat, var=Growth_Rate, n=5);
proc sort data=&data out=_top descending;
by &var;
run;
title "Top &n Startups by &var";
proc print data=_top (obs=&n);
var Startup_Name Industry &var Country Funding Employees Growth_Rate;
run;
%mend top_n;
%top_n(data=work.startups_cat, var=Growth_Rate, n=5);
OUTPUT:
| Obs | Startup_Name | Industry | Growth_Rate | Country | Funding | Employees | Growth_Rate |
|---|---|---|---|---|---|---|---|
| 1 | NanoHealth Bio | Biotech | 38.90 | Germany | $25,000,000 | 200 | 38.90 |
| 2 | MediSoft Labs | Healthcare IT | 32.10 | Canada | $12,000,000 | 120 | 32.10 |
| 3 | BlockWave Systems | Blockchain | 27.00 | Singapore | $4,600,000 | 35 | 27.00 |
| 4 | AeroScale Innovations | Aerospace | 24.50 | United States | $5,500,000 | 45 | 24.50 |
| 5 | CyberGuard AI | Cybersecurity | 23.20 | Israel | $6,100,000 | 52 | 23.20 |
YESTERDAY INVALID CODE ANSWER
4) PROC MEANS: Summary statistics on Sales and Rating
proc means data=work.video_games n mean median std min max mindec=2 maxdec=2;
var Sales Rating;
title "Summary statistics for Sales (million) and Rating";
run;
OUTPUT:
The MEANS Procedure
| Variable | N | Mean | Median | Std Dev | Minimum | Maximum |
|---|---|---|---|---|---|---|
Sales Rating | 12 12 | 80.63 8.62 | 29.50 8.95 | 103.30 0.85 | 6.50 7.00 | 350.00 9.70 |
No comments:
Post a Comment