INPUT | FORMAT STATEMENT | PROC SQL | DISTINCT | PROC SORT | NODUP | NODUPKEY | PROC SQL GROUP BY | HAVING CLAUSE
/*Write a PROC SQL query to identify duplicate records.*/
/*Remove duplicates using PROC SORT with NODUPKEY and NODUP.*/
DATA PATIENT_VISITS;
INPUT USUBJID VISIT$ VISITDT:YYMMDD10.;
FORMAT VISITDT DATE9.;
CARDS;
101 BASELINE 2024-01-01
101 WEEK1 2024-01-08
101 WEEK1 2024-01-08
102 BASELINE 2024-01-02
;
RUN;
PROC PRINT;RUN;
OUTPUT:
| Obs | USUBJID | VISIT | VISITDT |
|---|---|---|---|
| 1 | 101 | BASELINE | 01JAN2024 |
| 2 | 101 | WEEK1 | 08JAN2024 |
| 3 | 101 | WEEK1 | 08JAN2024 |
| 4 | 102 | BASELINE | 02JAN2024 |
PROC SQL;
SELECT DISTINCT *
FROM PATIENT_VISITS;
QUIT;
OUTPUT:
| USUBJID | VISIT | VISITDT |
|---|---|---|
| 101 | BASELINE | 01JAN2024 |
| 101 | WEEK1 | 08JAN2024 |
| 102 | BASELINE | 02JAN2024 |
/*ALTERNATE METHOD FOR DUPLICATES*/
PROC SORT DATA=PATIENT_VISITS OUT=SORTDUPKEY NODUPKEY;
BY USUBJID VISIT VISITDT ;
RUN;
PROC PRINT;RUN;
OUTPUT:
| Obs | USUBJID | VISIT | VISITDT |
|---|---|---|---|
| 1 | 101 | BASELINE | 01JAN2024 |
| 2 | 101 | WEEK1 | 08JAN2024 |
| 3 | 102 | BASELINE | 02JAN2024 |
PROC SORT DATA=PATIENT_VISITS OUT=SORTDUP NODUP;
BY _ALL_;
RUN;
PROC PRINT;RUN;
OUTPUT:
| Obs | USUBJID | VISIT | VISITDT |
|---|---|---|---|
| 1 | 101 | BASELINE | 01JAN2024 |
| 2 | 101 | WEEK1 | 08JAN2024 |
| 3 | 102 | BASELINE | 02JAN2024 |
/*COUNTS OF DUPLICATES*/
PROC SQL;
SELECT * , COUNT(*) AS DUP_COUNT
FROM PATIENT_VISITS
GROUP BY USUBJID,VISIT,VISITDT
HAVING COUNT(*) > 1;
QUIT;
| USUBJID | VISIT | VISITDT | DUP_COUNT |
|---|---|---|---|
| 101 | WEEK1 | 08JAN2024 | 2 |
No comments:
Post a Comment