144.SAS QUESTIONS

                                  SAS QUESTIONS


DATA BASEBALL;

 INPUT PlayerID $ Team $ Position $ Salary;

 DATALINES;

001 TeamA Pitcher 500000

002 TeamB Catcher 600000

003 TeamA Pitcher 500000

004 TeamC Shortstop 700000

005 TeamB Catcher 600000

;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 1 ----------------------------------*/

/*How do you identify and remove duplicate records in a SAS dataset?*/


/*Hint: Use the PROC SORT procedure with the NODUPKEY or DUPOUT options to identify 

and eliminate duplicates based on key variables.?*/


PROC SORT DATA=BASEBALL OUT=BASESORT NODUPKEY DUPOUT=OUTVAR;

 BY TEAM POSITION;

RUN;



DATA GPA;

   INPUT StudentID $ Math_GPA Science_GPA English_GPA;

   DATALINES;

S001 3.5 3.7 3.8

S002 3.2 3.3 3.5

S003 3.8 3.9 4.0

;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 2 ----------------------------------*/

/*How can you transpose a wide dataset to a long format in SAS?*/


/*Hint: Utilize the PROC TRANSPOSE procedure to restructure the data from wide 

to long format.*/


PROC TRANSPOSE DATA=GPA OUT=GPATRAN(RENAME=(COL1=GPA) DROP=_NAME_);

 BY STUDENTID;

 VAR Math_GPA Science_GPA English_GPA;

RUN;

PROC PRINT;RUN;


PROC TRANSPOSE DATA=GPA OUT=GPATRAN(RENAME=(COL1=GPA _NAME_=SUBJECT));

 BY STUDENTID;

 VAR Math_GPA Science_GPA English_GPA;

RUN;

PROC PRINT;RUN;



DATA HMEQ;

    CALL STREAMINIT(12345); /* Sets the seed for reproducibility */

    DO ID = 1 TO 10000;

        LoanAmount = RAND('NORMAL', 15000, 5000);

        CreditScore = RAND('NORMAL', 700, 50);

        Income = RAND('NORMAL', 60000, 10000);

        OUTPUT;

    END;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 3 ----------------------------------*/

/*What techniques can you use to efficiently handle large datasets in SAS?*/


/*Hint: Implement strategies such as data compression, indexing,

and optimizing I/O operations to enhance performance*/


/*Efficiently managing large datasets in SAS is crucial for optimizing performance and resource utilization. Here are several advanced techniques to consider*/

/**/

/*Data Compression*/

/**/

/*Purpose: Reduces the physical storage size of datasets, leading to decreased Input/Output (I/O) operations and faster data processing*/

/**/

/*Implementation: Apply the COMPRESS option during dataset creation to enable compression*/

/**/

data compressed_data (compress=yes);

    set large_data;

run;

/*Considerations: While compression saves storage space and can enhance I/O performance, it may increase CPU usage due to the overhead of compressing and decompressing data.?*/

/**/

/**/

/*Indexing:*/

/**/

/*Purpose: Facilitates faster data retrieval by creating indexes on specific variables, which is particularly beneficial for large datasets with frequent search or subset operations*/

/**/

/*Implementation: Create an index using the PROC DATASETS procedure or within PROC SQL*/

/**/

proc datasets library=work;

    modify large_data;

    index create var1;

quit;

/*Considerations: While indexes speed up data access, they consume additional storage space and may slow down data insertion and updating processes. Therefore, it's advisable to index only those variables that are frequently used in WHERE clauses or join conditions*/

/**/

/**/

/*Optimizing I/O Operations:*/

/**/

/*Purpose: Minimizes the time and resources spent on reading from and writing to disk, which is a common bottleneck when handling large datasets*/

/**/

/*Techniques:*/

/**/

/*Selective Variable Loading: Use the KEEP or DROP statements to read only necessary variables into memory, reducing the amount of data processed.*/

/**/

data subset_data;

    set large_data (keep=var1 var2 var3);

run;

/**/

/*Early Data Filtering: Apply the WHERE clause to filter data during the read process, limiting the number of observations loaded into memory.*/

/**/

data filtered_data;

    set large_data;

    where condition;

run;

/*Efficient Procedures: Utilize procedures optimized for large datasets, such as PROC FEDSQL, which is designed for high-performance SQL queries within SAS.*/

/**/

proc fedsql;

    create table results as

    select var1, var2, count(*) as frequency

    from large_data

    group by var1, var2;

quit;

/**/

/**/

/*Data Partitioning: Sort and partition data to enable parallel processing, distributing the workload across multiple threads or machines.*/

/**/

proc sort data=large_data out=sorted_data;

    by partition_var;

run;

/*Memory Management: Adjust system options like MEMSIZE and SORTSIZE to allocate sufficient memory for data processing, reducing the need for disk-based operations.*/

/**/

options memsize=8G sortsize=4G;

/**/

/*Considerations: Efficient I/O operations are critical for performance. Techniques such as indexing, compression, and careful data management can significantly reduce I/O load and enhance processing speeds*/

/**/

/*Implementing these strategies can lead to more efficient handling of large datasets in SAS, resulting in improved performance and resource utilization.*/


DATA BUSINESS1;

    INPUT CompanyID $ CompanyName $ Industry $;

    DATALINES;

C001 AlphaTech Technology

C002 BetaCorp Finance

C003 GammaInc Healthcare

;

RUN;

PROC PRINT;RUN;


DATA BUSINESS2;

    INPUT CompanyID $ CEO $ Revenue;

    DATALINES;

C001 JohnDoe 5000000

C002 JaneSmith 7000000

C003 JimBrown 6000000

;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 4 ----------------------------------*/

/*How do you merge two datasets with a one-to-one correspondence in SAS?*/


/*Hint: Use the MERGE statement in the DATA step, ensuring both */

/*datasets are sorted by the key variable.*/


DATA BUSINESS;

 MERGE BUSINESS1 

       BUSINESS2;

 BY COMPANYID;

RUN;

PROC PRINT;RUN;



DATA PATIENT;

    INPUT PatientID $ Age Gender $ Diagnosis $;

    DATALINES;

P001 45 M Diabetes

P002 34 F Hypertension

P003 29 M Asthma

;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 5 ----------------------------------*/

/*How can you create a custom format in SAS to categorize data values?*/


/*Hint: Use the PROC FORMAT procedure to define custom formats for data categorization.*/


PROC FORMAT;

 VALUE  $GENDER   'M' = 'MALE'

                  'F' = 'FEMALE';

RUN;


DATA PATIENT;

 SET PATIENT;

 FORMAT GENDER $GENDER.;

RUN;

PROC PRINT;RUN;



DATA DRUG;

    INPUT PatientID $ DrugGroup $ ResponseTime;

    DATALINES;

P001 A 5.2

P002 B 6.3

P003 A 4.8

P004 B 5.9

P005 A 5.1

;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 6 ----------------------------------*/

/*How do you calculate summary statistics for groups within a dataset in SAS?*/


/*Hint: Use the PROC SUMMARY or PROC MEANS procedures with the CLASS statement 

to compute statistics for each group*/


PROC MEANS DATA=DRUG;

 CLASS DRUGGROUP;

 VAR RESPONSETIME;

RUN;



DATA MINING;

    INPUT SiteID $ Depth Mineral Content Purity;

    DATALINES;

S001 100 20.5 98.5 . 

S002 150 22.0 97.8 .

S003 120 19.8 99.1 .

;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 7 ----------------------------------*/

/*How can you efficiently remove unwanted variables from a dataset in SAS?*/


/*Hint: Use the DROP statement or the DROP= data set option to exclude specific variables.*/


DATA MINING;

 SET MINING;

 DROP PURITY;

RUN;

PROC PRINT;RUN;


DATA IRIS;

    INPUT SepalLength SepalWidth PetalLength PetalWidth Species $;

    DATALINES;

5.1 3.5 1.4 0.2 Setosa

6.2 3.4 5.4 2.3 Virginica

5.9 3.0 4.2 1.5 Versicolor

;

RUN;

PROC PRINT;RUN;


/*------------------------QUESTION - 8 ----------------------------------*/

/*How can you control the order of variables in a SAS dataset?*/


/*Hint: Use the RETAIN statement in the DATA step to specify the order of variable*/


DATA IRIS;

 SET IRIS;

 RETAIN SepalLength SepalWidth PetalLength PetalWidth Species;

RUN;

PROC PRINT;RUN;

 

/*------------------------QUESTION - 9 ----------------------------------*/

/*How do you handle missing values in SAS datasets?*/


/*Hint: Use conditional statements to identify and

manage missing values during data processing.*/


DATA MINING2;

 SET MINING;

 IF MISSING(PURITY) THEN PURITY=0;

RUN;

PROC PRINT;RUN;


PRACTICE AND COMMENT YOUR CODE: 

-->PLEASE FOLLOW OUR BLOG FOR MORE UPDATES.

TO FOLLOW OUR TELEGRAM CHANNEL CLICK HERE

Comments