Tuesday, 9 December 2025

334.STARTUP DATASET CREATION AND ANALYTICAL WORKFLOW USING PROC SQL | PROC MEANS | PROC CORR | PROC SGPLOT | MACROS WITH INTCK AND INTNX DATE FUNCTIONS

STARTUP DATASET CREATION AND ANALYTICAL WORKFLOW USING PROC SQL | PROC MEANS | PROC CORR | PROC SGPLOT | MACROS WITH INTCK AND INTNX DATE FUNCTIONS

 options nocenter;

1) Create startup table using PROC SQL with SAS date literals for Founded and Last Funding */

proc sql;

    create table work.startups

    (

        Startup_Name        char(40),

        Industry            char(30),

        Funding             num format=dollar12.,

        Employees           num,

        Growth_Rate         num format=8.2,

        Country             char(30),

        Founded_Date        num format=ddmmyy10.,

        Last_Funding_Date   num format=ddmmyy10.

    );

quit;

LOG:

NOTE: Table WORK.STARTUPS created, with 0 rows and 8 columns.

2) Inserting the company names,employees and more

proc sql;

    insert into work.startups

    select 

        "AeroScale Innovations", "Aerospace", 5500000, 45, 24.5, "United States",

        input("15JAN2018", date9.) , input("10JUN2023", date9.)

    from sashelp.class(obs=1)


    union all select

        "GreenLoop Energy", "Clean Energy", 3200000, 28, 18.0, "India",

        input("02MAR2019", date9.) , input("01DEC2022", date9.)

    from sashelp.class(obs=1)


    union all select

        "MediSoft Labs", "Healthcare IT", 12000000, 120, 32.1, "Canada",

        input("20JUL2017", date9.) , input("15FEB2024", date9.)

    from sashelp.class(obs=1)


    union all select

        "FinSight Analytics", "Fintech", 7500000, 60, 21.4, "United Kingdom",

        input("05MAY2016", date9.) , input("20SEP2023", date9.)

    from sashelp.class(obs=1)


    union all select

        "EduNext Platform", "EdTech", 1800000, 15, 12.8, "India",

        input("19NOV2020", date9.) , input("11NOV2023", date9.)

    from sashelp.class(obs=1)


    union all select

        "AgriSense Solutions", "AgriTech", 1400000, 10, 9.5, "Brazil",

        input("01JAN2021", date9.) , input("01MAR2022", date9.)

    from sashelp.class(obs=1)


    union all select

        "BlockWave Systems", "Blockchain", 4600000, 35, 27.0, "Singapore",

        input("10OCT2018", date9.) , input("20AUG2023", date9.)

    from sashelp.class(obs=1)


    union all select

        "RetailRover", "Ecommerce", 2500000, 22, 14.3, "United States",

        input("23JUN2019", date9.) , input("05JUL2023", date9.)

    from sashelp.class(obs=1)


    union all select

        "NanoHealth Bio", "Biotech", 25000000, 200, 38.9, "Germany",

        input("12DEC2015", date9.) , input("28FEB2024", date9.)

    from sashelp.class(obs=1)


    union all select

        "UrbanMobility Labs", "Mobility", 9000000, 80, 19.7, "Netherlands",

        input("30SEP2016", date9.) , input("10JAN2024", date9.)

    from sashelp.class(obs=1)


    union all select

        "CyberGuard AI", "Cybersecurity", 6100000, 52, 23.2, "Israel",

        input("07APR2017", date9.) , input("02NOV2023", date9.)

    from sashelp.class(obs=1)


    union all select

        "FoodLoop Delivery", "Logistics", 2100000, 18, 11.6, "India",

        input("14AUG2020", date9.) , input("15AUG2022", date9.)

    from sashelp.class(obs=1);

quit;


proc print data=work.startups label noobs;

  var Startup_Name Industry Funding Employees Growth_Rate Country Founded_Date Last_Funding_Date;

  format Funding dollar12. Founded_Date ddmmyy10. Last_Funding_Date ddmmyy10.;

  title "Initial Startup Table (12 Observations)";

run;

OUTPUT:

Initial Startup Table (12 Observations)

Startup_NameIndustryFundingEmployeesGrowth_RateCountryFounded_DateLast_Funding_Date
AeroScale InnovationsAerospace$5,500,0004524.50United States15/01/201810/06/2023
GreenLoop EnergyClean Energy$3,200,0002818.00India02/03/201901/12/2022
MediSoft LabsHealthcare IT$12,000,00012032.10Canada20/07/201715/02/2024
FinSight AnalyticsFintech$7,500,0006021.40United Kingdom05/05/201620/09/2023
EduNext PlatformEdTech$1,800,0001512.80India19/11/202011/11/2023
AgriSense SolutionsAgriTech$1,400,000109.50Brazil01/01/202101/03/2022
BlockWave SystemsBlockchain$4,600,0003527.00Singapore10/10/201820/08/2023
RetailRoverEcommerce$2,500,0002214.30United States23/06/201905/07/2023
NanoHealth BioBiotech$25,000,00020038.90Germany12/12/201528/02/2024
UrbanMobility LabsMobility$9,000,0008019.70Netherlands30/09/201610/01/2024
CyberGuard AICybersecurity$6,100,0005223.20Israel07/04/201702/11/2023
FoodLoop DeliveryLogistics$2,100,0001811.60India14/08/202015/08/2022

3) Compute numeric summaries using PROC MEANS 

title "Summary Statistics (Funding, Employees, Growth_Rate)";

proc means data=work.startups n mean median std min max maxdec=2;

  var Funding Employees Growth_Rate;

run;

OUTPUT:

Summary Statistics (Funding, Employees, Growth_Rate)

The MEANS Procedure

VariableNMeanMedianStd DevMinimumMaximum
Funding
Employees
Growth_Rate
12
12
12
6725000.00
57.08
21.08
5050000.00
40.00
20.55
6593815.97
54.91
8.74
1400000.00
10.00
9.50
25000000.00
200.00
38.90

4) Compute tertiles (33rd and 66th percentiles) for Growth_Rate using PROC UNIVARIATE 

proc univariate data=work.startups noprint;

  var Growth_Rate;

  output out=work.growth_pcts pctlpts=33 66 pctlpre=P_;

run;

proc print data=work.growth_pcts;

run;

OUTPUT:

ObsP_33P_66
114.323.2

5) Macro to calculate company age and next funding anniversary using INTCK and INTNX 

%macro compute_dates(in=work.startups, out=work.startups_dates);

  data &out;

    set ∈

    /* Age in completed years */

    Company_Age_Years = intck('year', Founded_Date, today());

    /* Age with fractional years (approx) using days */

    Company_Age_Days  = intck('day', Founded_Date, today());

    Company_Age_Years_frac = Company_Age_Days / 365.25;


    /* Next funding anniversary after Last_Funding_Date: align to next year boundary */

    Next_Funding_Anniversary = intnx('year', Last_Funding_Date, 1, 'same');

    /* Compute days until next funding anniversary */

    Days_to_Next_Anniv = intck('day', today(), Next_Funding_Anniversary);


    format Founded_Date ddmmyy10. Last_Funding_Date ddmmyy10. Next_Funding_Anniversary ddmmyy10.;

  run;


  /* Quick sanity checks */

  proc print data=&out (obs=12);

    var Startup_Name Founded_Date Company_Age_Years Company_Age_Years_frac Last_Funding_Date Next_Funding_Anniversary Days_to_Next_Anniv;

    title "Company Age and Next Funding Anniversary";

  run;

%mend compute_dates;


%compute_dates(in=work.startups, out=work.startups_dates);

OUTPUT:

Company Age and Next Funding Anniversary

ObsStartup_NameFounded_DateCompany_Age_YearsCompany_Age_Years_fracLast_Funding_DateNext_Funding_AnniversaryDays_to_Next_Anniv
1AeroScale Innovations15/01/201877.9014410/06/202310/06/2024-548
2GreenLoop Energy02/03/201966.7761801/12/202201/12/2023-740
3MediSoft Labs20/07/201788.3915115/02/202415/02/2025-298
4FinSight Analytics05/05/201699.5989020/09/202320/09/2024-446
5EduNext Platform19/11/202055.0568111/11/202311/11/2024-394
6AgriSense Solutions01/01/202144.9390801/03/202201/03/2023-1015
7BlockWave Systems10/10/201877.1676920/08/202320/08/2024-477
8RetailRover23/06/201966.4668005/07/202305/07/2024-523
9NanoHealth Bio12/12/2015109.9958928/02/202428/02/2025-285
10UrbanMobility Labs30/09/201699.1937010/01/202410/01/2025-334
11CyberGuard AI07/04/201788.6762502/11/202302/11/2024-403
12FoodLoop Delivery14/08/202055.3223815/08/202215/08/2023-848

6) Macro to classify Growth_Rate into tertiles (Low/Medium/High) using the percentiles computed earlier 

%macro classify_growth(in=work.startups_dates, out=work.startups_cat);

  /* read percentiles into macro variables */

  proc sql noprint;

    select P_33, P_66 into :p33 trimmed, :p66 trimmed from work.growth_pcts;

  quit;


  data &out;

    set ∈

    length Growth_Category $7;

    if missing(Growth_Rate) then Growth_Category = 'Unknown';

    else if Growth_Rate <= &p33 then Growth_Category = 'Low';

    else if Growth_Rate <= &p66 then Growth_Category = 'Medium';

    else Growth_Category = 'High';

  run;

  proc print data=&out;

  run;


  proc freq data=&out;

    tables Growth_Category / nocum;

    title "Growth Category Distribution (tertiles: &p33 and &p66)";

  run;

%mend classify_growth;


%classify_growth(in=work.startups_dates, out=work.startups_cat);

OUTPUT:

ObsStartup_NameIndustryFundingEmployeesGrowth_RateCountryFounded_DateLast_Funding_DateCompany_Age_YearsCompany_Age_DaysCompany_Age_Years_fracNext_Funding_AnniversaryDays_to_Next_AnnivGrowth_Category
1AeroScale InnovationsAerospace$5,500,0004524.50United States15/01/201810/06/2023728867.9014410/06/2024-548High
2GreenLoop EnergyClean Energy$3,200,0002818.00India02/03/201901/12/2022624756.7761801/12/2023-740Medium
3MediSoft LabsHealthcare IT$12,000,00012032.10Canada20/07/201715/02/2024830658.3915115/02/2025-298High
4FinSight AnalyticsFintech$7,500,0006021.40United Kingdom05/05/201620/09/2023935069.5989020/09/2024-446Medium
5EduNext PlatformEdTech$1,800,0001512.80India19/11/202011/11/2023518475.0568111/11/2024-394Low
6AgriSense SolutionsAgriTech$1,400,000109.50Brazil01/01/202101/03/2022418044.9390801/03/2023-1015Low
7BlockWave SystemsBlockchain$4,600,0003527.00Singapore10/10/201820/08/2023726187.1676920/08/2024-477High
8RetailRoverEcommerce$2,500,0002214.30United States23/06/201905/07/2023623626.4668005/07/2024-523Low
9NanoHealth BioBiotech$25,000,00020038.90Germany12/12/201528/02/20241036519.9958928/02/2025-285High
10UrbanMobility LabsMobility$9,000,0008019.70Netherlands30/09/201610/01/2024933589.1937010/01/2025-334Medium
11CyberGuard AICybersecurity$6,100,0005223.20Israel07/04/201702/11/2023831698.6762502/11/2024-403Medium
12FoodLoop DeliveryLogistics$2,100,0001811.60India14/08/202015/08/2022519445.3223815/08/2023-848Low

Growth Category Distribution (tertiles: 14.3 and 23.2)

The FREQ Procedure

Growth_CategoryFrequencyPercent
High433.33
Low433.33
Medium433.33

7) Correlation analysis between Funding, Employees, Growth_Rate, and Company_Age_Years 

title "Pearson Correlations";

proc corr data=work.startups_cat nosimple pearson;

  var Funding Employees Growth_Rate Company_Age_Years;

run;

OUTPUT:

Pearson Correlations

The CORR Procedure

4 Variables:Funding Employees Growth_Rate Company_Age_Years
Pearson Correlation Coefficients, N = 12
Prob > |r| under H0: Rho=0
 FundingEmployeesGrowth_RateCompany_Age_Years
Funding
1.00000
 
0.99236
<.0001
0.86541
0.0003
0.80409
0.0016
Employees
0.99236
<.0001
1.00000
 
0.87481
0.0002
0.80699
0.0015
Growth_Rate
0.86541
0.0003
0.87481
0.0002
1.00000
 
0.80925
0.0014
Company_Age_Years
0.80409
0.0016
0.80699
0.0015
0.80925
0.0014
1.00000
 

8) Visualizations: scatter, vbar and boxplot using PROC SGPLOT 

title "Scatter: Funding vs Growth_Rate with Regression";

proc sgplot data=work.startups_cat;

  scatter x=Funding y=Growth_Rate / markerattrs=(symbol=CircleFilled) datalabel=Startup_Name;

  reg x=Funding y=Growth_Rate / nomarkers lineattrs=(pattern=1);

  xaxis label="Funding (USD)";

  yaxis label="Growth Rate (%)";

run;

OUTPUT:

The SGPlot Procedure


title "Counts by Growth Category";

proc sgplot data=work.startups_cat;

  vbar Growth_Category / datalabel;

  xaxis label="Growth Category";

  yaxis label="Count";

run;

OUTPUT:

The SGPlot Procedure


title "Growth Rate by Industry (Boxplot)";

proc sgplot data=work.startups_cat;

  vbox Growth_Rate / category=Industry;

  yaxis label="Growth Rate (%)";

run;

OUTPUT:

The SGPlot Procedure


9) Utility macro to list top N by a chosen metric 

%macro top_n(data=work.startups_cat, var=Growth_Rate, n=5);

  proc sort data=&data out=_top descending;

    by &var;

  run;


  title "Top &n Startups by &var";

  proc print data=_top (obs=&n);

    var Startup_Name Industry &var Country Funding Employees Growth_Rate;

  run;

%mend top_n;


%top_n(data=work.startups_cat, var=Growth_Rate, n=5);

OUTPUT:

Top 5 Startups by Growth_Rate

ObsStartup_NameIndustryGrowth_RateCountryFundingEmployeesGrowth_Rate
1NanoHealth BioBiotech38.90Germany$25,000,00020038.90
2MediSoft LabsHealthcare IT32.10Canada$12,000,00012032.10
3BlockWave SystemsBlockchain27.00Singapore$4,600,0003527.00
4AeroScale InnovationsAerospace24.50United States$5,500,0004524.50
5CyberGuard AICybersecurity23.20Israel$6,100,0005223.20

/* Note: In practice above there is an Invalid in this code Find it,Correct it and Use it /*

YESTERDAY INVALID CODE ANSWER

4) PROC MEANS: Summary statistics on Sales and Rating

proc means data=work.video_games n mean median std min max mindec=2 maxdec=2;

  var Sales Rating;

  title "Summary statistics for Sales (million) and Rating";

run;

OUTPUT:

Summary statistics for Sales (million) and Rating

The MEANS Procedure

VariableNMeanMedianStd DevMinimumMaximum
Sales
Rating
12
12
80.63
8.62
29.50
8.95
103.30
0.85
6.50
7.00
350.00
9.70


Answer:
MINDEC= is allowed in Excel, some statistical tools, but NOT in SAS PROC MEANS.If You want both min & max ,You still use only MAXDEC, SAS formats both min & max outputs consistently.

link:https://sasall4you.blogspot.com/2025/12/333video-games-dataset-creation-and.html





To Visit My Previous Different Tablets Market Analysis Dataset:Click Here
To Visit My Previous Self Description Analysis Dataset:Click Here
To Visit My Previous Birds Of India Dataset:Click Here
To Visit My Previous Ganesh Mandaps In India Dataset:Click Here  



Follow Us On : 


 


--- FOLLOW OUR BLOG FOR MORE INFORMATION.

--->PLEASE DO COMMENTS AND SHARE OUR BLOG.




No comments:

Post a Comment