134.A NEW EXAMPLE : DO | SELECT | RANUNI | INT | PROC MEANS | PROC FREQ | PROC SORT | PROC UNIVARIATE | PROC TABULATE | %MACRO - %MEND | PROC BOXPLOT | PROC EXPORT | PROC CORR

A NEW EXAMPLE : DO | SELECT | RANUNI | INT | PROC MEANS | PROC FREQ | PROC SORT | PROC UNIVARIATE | PROC TABULATE | %MACRO - %MEND | PROC BOXPLOT | PROC EXPORT | PROC CORR 


/* Step 1: Creating a unique dataset */

/* This dataset contains information about employees, including their ID, Name, Department, Gender, Age, Experience, and Salary. 

   We use the RANUNI function to generate random values for each attribute. */

DATA employee_data;

    LENGTH Employee_ID $6 Name $10 Department $15 Gender $1 Salary 8. Age 8. Experience 8.;

    FORMAT Salary DOLLAR8.2;

    DO i = 1001 TO 1500;

        Employee_ID = CAT('E', PUT(i, 4.));

        Name = CAT('Emp', PUT(i, 4.));

        

        /* Assigning a random department using SELECT statement */

        department_choice = INT(RANUNI(1234)*5) + 1;

        SELECT (department_choice);

            WHEN (1) Department = 'HR';

            WHEN (2) Department = 'IT';

            WHEN (3) Department = 'Finance';

            WHEN (4) Department = 'Marketing';

            WHEN (5) Department = 'Sales';

            OTHERWISE Department = 'Unknown';

        END;

        

        /* Assigning random gender */

        IF RANUNI(2345) < 0.5 THEN Gender = 'M';

        ELSE Gender = 'F';

        

        /* Assigning random age between 20 and 50 */

        Age = INT(RANUNI(3456) * 30) + 20;

        

        /* Assigning random experience between 1 and 10 years */

        Experience = INT(RANUNI(4567) * 10) + 1;

        

        /* Assigning a random salary between $30,000 and $80,000 */

        Salary = INT(RANUNI(5678) * 50000) + 30000;

        

        OUTPUT;

    END;

    DROP i department_choice;

RUN;

/* Step 2: Display first few observations */

/* This step helps us verify if the dataset was created correctly. */

PROC PRINT DATA=employee_data (OBS=10);

    TITLE "First 10 Records of Employee Dataset";

RUN;


Output:   

                                                    First 10 Records of Employee Dataset

Obs Employee_ID Name Department Gender Salary Age Experience
1 E1001 Emp1001 IT M 42879.00 31 1
2 E1002 Emp1002 HR M 37156.00 23 5
3 E1003 Emp1003 HR M 78372.00 22 10
4 E1004 Emp1004 Marketing M 55513.00 31 4
5 E1005 Emp1005 IT M 36200.00 27 5
6 E1006 Emp1006 Marketing M 41724.00 35 10
7 E1007 Emp1007 Marketing M 62042.00 23 6
8 E1008 Emp1008 Marketing F 51051.00 49 6
9 E1009 Emp1009 Finance F 39050.00 41 4
10 E1010 Emp1010 Sales M 58721.00 29 10


/* Step 3: Descriptive Statistics using PROC MEANS */

/* This procedure provides statistical summaries, such as mean, median, minimum, maximum, and standard deviation for Age, Experience, and Salary. */

PROC MEANS DATA=employee_data MEAN MEDIAN MIN MAX STD;

    VAR Age Experience Salary;

    TITLE "Descriptive Statistics for Age, Experience, and Salary";

RUN;


Output:

                                              Descriptive Statistics for Age, Experience, and Salary

                                                               The MEANS Procedure

Variable Mean Median Minimum Maximum Std Dev
Age
Experience
Salary
33.8540000
5.4600000
56015.82
34.0000000
5.5000000
56434.00
20.0000000
1.0000000
30364.00
49.0000000
10.0000000
79968.00
8.6842107
2.8454513
14701.34


/* Step 4: Frequency Distribution using PROC FREQ */

/* This procedure calculates the frequency distribution of employees based on Department and Gender. */

PROC FREQ DATA=employee_data;

    TABLES Department Gender / NOROW NOCOL NOPERCENT;

    TITLE "Frequency Distribution of Departments and Gender";

RUN;


Output:

                                            Frequency Distribution of Departments and Gender

                                                             The FREQ Procedure

Department Frequency Cumulative
Frequency
Finance 101 101
HR 102 203
IT 100 303
Marketing 102 405
Sales 95 500


Gender Frequency Cumulative
Frequency
F 248 248
M 252 500


/* Step 5: Sorting Data using PROC SORT */

/* Sorting data helps organize it for further analysis. Here, we sort employees by department and then by descending salary. */

PROC SORT DATA=employee_data OUT=sorted_data;

    BY Department DESCENDING Salary;

RUN;

/* Displaying the top 10 employees after sorting */

PROC PRINT DATA=sorted_data (OBS=10);

    TITLE "Top 10 Salaries in Each Department";

RUN;


Output:

                                               Top 10 Salaries in Each Department

Obs Employee_ID Name Department Gender Salary Age Experience
1 E1170 Emp1170 Finance M 79745.00 42 8
2 E1200 Emp1200 Finance M 79431.00 44 8
3 E1110 Emp1110 Finance F 79093.00 37 10
4 E1358 Emp1358 Finance F 78470.00 41 9
5 E1328 Emp1328 Finance M 77814.00 47 3
6 E1334 Emp1334 Finance F 77647.00 32 4
7 E1067 Emp1067 Finance M 77094.00 37 6
8 E1107 Emp1107 Finance M 76994.00 25 1
9 E1267 Emp1267 Finance M 76868.00 30 2
10 E1207 Emp1207 Finance F 76585.00 31 3


/* Step 6: Generating Summary Reports using PROC REPORT */

/* This report groups data by Department and Gender and calculates the total salary per group*/

PROC REPORT DATA=employee_data NOWD;

    COLUMN Department Gender Salary;

    DEFINE Department / GROUP;

    DEFINE Gender / GROUP;

    DEFINE Salary / ANALYSIS SUM FORMAT=DOLLAR11.2 "Total Salary";

    TITLE "Total Salary Paid by Department and Gender";

RUN;


Output:

                                               Total Salary Paid by Department and Gender

Department Gender Total Salary
Finance F $2927937.00
  M $3024388.00
HR F $3019456.00
  M $2684792.00
IT F $2718223.00
  M $2911039.00
Marketing F $2462906.00
  M $2925769.00
Sales F $2682550.00
  M $2650849.00


/* Step 7: Using PROC SQL for Data Analysis */

/* SQL in SAS is useful for summarizing data in a structured format. Here, we count employees per department and compute the average salary. */

PROC SQL;

    SELECT Department, COUNT(*) AS Employee_Count, AVG(Salary) AS Avg_Salary FORMAT=DOLLAR8.2

    FROM employee_data

    GROUP BY Department;

QUIT;


Output:

Department Employee_Count Avg_Salary
Finance 101 58933.91
HR 102 55924.00
IT 100 56292.62
Marketing 102 52830.15
Sales 95 56141.04


/* Step 8: Checking Distribution using PROC UNIVARIATE */

/* This procedure analyzes the distribution of Salary and generates a histogram to visualize it. */

PROC UNIVARIATE DATA=employee_data;

    VAR Salary;

    HISTOGRAM / NORMAL;

    TITLE "Salary Distribution Analysis";

RUN;


Output:

                                                    Salary Distribution Analysis 

                                                    The UNIVARIATE Procedure 

                                                                 Variable: Salary

Moments
N 500 Sum Weights 500
Mean 56015.818 Sum Observations 28007909
Std Deviation 14701.3361 Variance 216129283
Skewness -0.1159668 Kurtosis -1.1932332
Uncorrected SS 1.67673E12 Corrected SS 1.07849E11
Coeff Variation 26.2449726 Std Error Mean 657.463737

Basic Statistical Measures
Location Variability
Mean 56015.82 Std Deviation 14701
Median 56434.00 Variance 216129283
Mode 38115.00 Range 49604
    Interquartile Range 25777

Fitted Normal Distribution for Salary

Parameters for Normal Distribution
Parameter Symbol Estimate
Mean Mu 56015.82
Std Dev Sigma 14701.34

Goodness-of-Fit Tests for Normal Distribution
Test Statistic p Value
Kolmogorov-Smirnov D 0.06930920 Pr > D <0.010
Cramer-von Mises W-Sq 0.78210857 Pr > W-Sq <0.005
Anderson-Darling A-Sq 6.09638867 Pr > A-Sq <0.005

Quantiles for Normal Distribution
Percent Quantile
Observed Estimated
1.0 30670.0 21815.4
5.0 32026.5 31834.3
10.0 34272.5 37175.3
25.0 43301.5 46099.9
50.0 56434.0 56015.8
75.0 69078.5 65931.7
90.0 75243.5 74856.3
95.0 77752.0 80197.4
99.0 79417.0 90216.2

/* Step 9: Advanced Analysis with PROC TABULATE */

/* This procedure generates a cross-tabulated summary of data. Here, we summarize salary by Department and Gender. */

PROC TABULATE DATA=employee_data;

    CLASS Department Gender;

    VAR Salary;

    TABLE Department * Gender, Salary * (SUM MEAN MIN MAX);

    TITLE "Salary Statistics by Department and Gender";

RUN;


Output:

                                  Salary Statistics by Department and Gender

  Salary
Sum Mean Min Max
Department Gender 2927937.00 58558.74 31150.00 79093.00
Finance F
M 3024388.00 59301.73 30876.00 79745.00
HR F 3019456.00 55915.85 31568.00 79656.00
M 2684792.00 55933.17 30623.00 79057.00
IT F 2718223.00 56629.65 30987.00 79968.00
M 2911039.00 55981.52 30488.00 78767.00
Marketing F 2462906.00 50263.39 30666.00 78742.00
M 2925769.00 55203.19 30364.00 79024.00
Sales F 2682550.00 57075.53 30674.00 79373.00
M 2650849.00 55226.02 30484.00 79795.00


/* Step 10: Box Plot Visualization using PROC BOXPLOT */

/* Boxplots help in understanding the spread of salaries across different departments. */

PROC BOXPLOT DATA=employee_data;

    PLOT Salary*Department;

    TITLE "Boxplot of Salaries by Department";

RUN;


/* Step 11: Exporting Data using PROC EXPORT */

/* If we need to share the dataset with external applications, we can export it as a CSV file. */

PROC EXPORT DATA=employee_data

    OUTFILE="/home/user/employee_data.csv"

    DBMS=CSV

    REPLACE;

RUN;


/* Step 12: Creating a Macro for Salary Analysis */

/* This macro automates salary analysis for a given department. */

%MACRO SalaryAnalysis(dept);

    PROC SQL;

        SELECT * FROM employee_data

        WHERE Department = "&dept.";

    QUIT;

%MEND SalaryAnalysis;


/* Example: Running the macro for IT department */

%SalaryAnalysis(IT);


Output:

Employee_ID Name Department Gender Salary Age Experience
E1001 Emp1001 IT M 42879.00 31 1
E1005 Emp1005 IT M 36200.00 27 5
E1011 Emp1011 IT F 60132.00 36 7
E1012 Emp1012 IT M 75946.00 28 8
E1014 Emp1014 IT F 38115.00 41 5
E1017 Emp1017 IT M 35748.00 36 10
E1023 Emp1023 IT M 57832.00 46 8
E1033 Emp1033 IT F 74660.00 42 8
E1043 Emp1043 IT F 61208.00 41 4
E1049 Emp1049 IT M 36391.00 21 2


/* Step 13: Correlation Analysis using PROC CORR */

/* This procedure examines the relationship between Age, Experience, and Salary. */

PROC CORR DATA=employee_data;

    VAR Age Experience Salary;

    TITLE "Correlation Analysis between Age, Experience, and Salary";

RUN;


Ouptut:
                                   Correlation Analysis between Age, Experience, and Salary

                                                             The CORR Procedure

3 Variables: Age Experience Salary

Simple Statistics
Variable N Mean Std Dev Sum Minimum Maximum
Age 500 33.85400 8.68421 16927 20.00000 49.00000
Experience 500 5.46000 2.84545 2730 1.00000 10.00000
Salary 500 56016 14701 28007909 30364 79968

Pearson Correlation Coefficients, N = 500
Prob > |r| under H0: Rho=0
  Age Experience Salary
Age
1.00000
 
0.06128
0.1713
-0.01875
0.6757
Experience
0.06128
0.1713
1.00000
 
0.06660
0.1370
Salary
-0.01875
0.6757
0.06660
0.1370
1.00000
 


PRACTICE AND COMMENT YOUR CODE: 

-->PLEASE FOLLOW OUR BLOG FOR MORE UPDATES.

TO FOLLOW OUR TELEGRAM CHANNEL CLICK HERE

Comments