Sunday, 9 November 2025

304.INDIAN FREEDOM FIGHTERS DATA CREATION, VALIDATION, DISTANCE ANALYSIS, AND GEOGRAPHIC MAPPING USING PROC CONTENTS, PROC FREQ, PROC MEANS, PROC SQL, AND PROC SGPLOT

INDIAN FREEDOM FIGHTERS DATA CREATION, VALIDATION, DISTANCE ANALYSIS, AND GEOGRAPHIC MAPPING USING PROC CONTENTS, PROC FREQ, PROC MEANS, PROC SQL, AND PROC SGPLOT

 1. create dataset

options nocenter;

data work.freedom_raw;

    infile datalines truncover;

    length FIGHTER_ID $4 NAME $50 STATE $30 remaining $200;

    input FIGHTER_ID $ @;

    input remaining $char200.;


    /* Define state list manually */

    array statelist[10] $30 _temporary_;

    statelist[1] = 'Gujarat';

    statelist[2] = 'Punjab';

    statelist[3] = 'WestBengal';

    statelist[4] = 'UttarPradesh';

    statelist[5] = 'Maharashtra';

    statelist[6] = 'Jhansi';

    statelist[7] = 'Madras';

    statelist[8] = 'TamilNadu';

    statelist[9] = 'Kolkata';

    statelist[10] = 'Peshawar';


    /* Find and split by state */

    do i = 1 to dim(statelist);

        pos = find(remaining, statelist[i], 'it');

        if pos > 0 then do;

            STATE = statelist[i];

            NAME = strip(substr(remaining, 1, pos - 1));


            /* Extract remaining numbers after the state */

            numpart = substr(remaining, pos + length(STATE));

            /* Scan the numbers from the tail part */

            BIRTH_YEAR = input(scan(numpart, 1, ' '), 8.);

            LATITUDE   = input(scan(numpart, 2, ' '), best.);

            LONGITUDE  = input(scan(numpart, 3, ' '), best.);

            leave;

        end;

    end;


    drop remaining pos i numpart;

datalines;

F001 Mahatma Gandhi Gujarat 1869 21.8090 72.1360

F002 Bhagat Singh Punjab 1907 31.1471 75.3412

F003 Subhas Chandra Bose WestBengal 1897 22.5726 88.3639

F004 Jawaharlal Nehru UttarPradesh 1889 26.8467 80.9462

F005 Sardar Vallabhbhai Patel Gujarat 1875 23.0225 72.5714

F006 Rani Lakshmibai Jhansi 1828 25.4460 78.5689

F007 Bal Gangadhar Tilak Maharashtra 1856 18.5204 73.8567

F008 Lala Lajpat Rai Punjab 1865 28.7041 77.1025

F009 Annie Besant Madras 1847 13.0827 80.2707

F010 Subramania Bharati TamilNadu 1882 10.7905 79.1370

F011 Chittaranjan Das Kolkata 1870 22.5726 88.3639

F012 Khan Abdul Ghaffar Khan Peshawar 1890 34.0151 71.5249

;

run;

proc print data=freedom_raw noobs;

    title "Cleaned Indian Freedom Fighters Dataset";

run;

OUTPUT:

Cleaned Indian Freedom Fighters Dataset

FIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDE
F001Mahatma GandhiGujarat186921.809072.1360
F002Bhagat SinghPunjab190731.147175.3412
F003Subhas Chandra BoseWestBengal189722.572688.3639
F004Jawaharlal NehruUttarPradesh188926.846780.9462
F005Sardar Vallabhbhai PatelGujarat187523.022572.5714
F006Rani LakshmibaiJhansi182825.446078.5689
F007Bal Gangadhar TilakMaharashtra185618.520473.8567
F008Lala Lajpat RaiPunjab186528.704177.1025
F009Annie BesantMadras184713.082780.2707
F010Subramania BharatiTamilNadu188210.790579.1370
F011Chittaranjan DasKolkata187022.572688.3639
F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249

2. Data cleaning & validation steps

2.1 Inspect structure and quick frequency checks

proc contents data=work.freedom_raw varnum; 

run;

OUTPUT:

The CONTENTS Procedure

Data Set NameWORK.FREEDOM_RAWObservations12
Member TypeDATAVariables6
EngineV9Indexes0
Created11/10/2025 09:08:04Observation Length112
Last Modified11/10/2025 09:08:04Deleted Observations0
Protection CompressedNO
Data Set Type SortedNO
Label   
Data RepresentationSOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64  
Encodingutf-8 Unicode (UTF-8)  
Engine/Host Dependent Information
Data Set Page Size131072
Number of Data Set Pages1
First Data Page1
Max Obs per Page1168
Obs in First Data Page12
Number of Data Set Repairs0
Filename/saswork/SAS_workB17C00018A19_odaws01-apse1-2.oda.sas.com/SAS_workC0CD00018A19_odaws01-apse1-2.oda.sas.com/freedom_raw.sas7bdat
Release Created9.0401M8
Host CreatedLinux
Inode Number67110951
Access Permissionrw-r--r--
Owner Nameu63247146
File Size256KB
File Size (bytes)262144
Variables in Creation Order
#VariableTypeLen
1FIGHTER_IDChar4
2NAMEChar50
3STATEChar30
4BIRTH_YEARNum8
5LATITUDENum8
6LONGITUDENum8

proc freq data=work.freedom_raw; 

  tables STATE / nocum; 

run;

OUTPUT:

The FREQ Procedure

STATEFrequencyPercent
Gujarat216.67
Jhansi18.33
Kolkata18.33
Madras18.33
Maharashtra18.33
Peshawar18.33
Punjab216.67
TamilNadu18.33
UttarPradesh18.33
WestBengal18.33

proc print data=work.freedom_raw(obs=12);

run;

OUTPUT:

ObsFIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDE
1F001Mahatma GandhiGujarat186921.809072.1360
2F002Bhagat SinghPunjab190731.147175.3412
3F003Subhas Chandra BoseWestBengal189722.572688.3639
4F004Jawaharlal NehruUttarPradesh188926.846780.9462
5F005Sardar Vallabhbhai PatelGujarat187523.022572.5714
6F006Rani LakshmibaiJhansi182825.446078.5689
7F007Bal Gangadhar TilakMaharashtra185618.520473.8567
8F008Lala Lajpat RaiPunjab186528.704177.1025
9F009Annie BesantMadras184713.082780.2707
10F010Subramania BharatiTamilNadu188210.790579.1370
11F011Chittaranjan DasKolkata187022.572688.3639
12F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249


2.2 Coordinate sanity checks

data work.freedom_clean1;

set work.freedom_raw;

length coord_flag $50;

coord_flag='OK';

if LATITUDE=. or LONGITUDE=. then coord_flag='MISSING_COORD';

else if LATITUDE < -90 or LATITUDE > 90 then coord_flag='BAD_LAT';

else if LATITUDE < -180 or LONGITUDE > 180 then coord_flag='BAD_LON';

/* Trim and normalize text fields */

NAME = strip(NAME);

STATE = propcase(strip(translate(STATE,' ','_'))); /* ensure consistent case; underscores preserved */

run;

proc print data=work.freedom_clean1; 

run;

OUTPUT:

ObsFIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDEcoord_flag
1F001Mahatma GandhiGujarat186921.809072.1360OK
2F002Bhagat SinghPunjab190731.147175.3412OK
3F003Subhas Chandra BoseWestbengal189722.572688.3639OK
4F004Jawaharlal NehruUttarpradesh188926.846780.9462OK
5F005Sardar Vallabhbhai PatelGujarat187523.022572.5714OK
6F006Rani LakshmibaiJhansi182825.446078.5689OK
7F007Bal Gangadhar TilakMaharashtra185618.520473.8567OK
8F008Lala Lajpat RaiPunjab186528.704177.1025OK
9F009Annie BesantMadras184713.082780.2707OK
10F010Subramania BharatiTamilnadu188210.790579.1370OK
11F011Chittaranjan DasKolkata187022.572688.3639OK
12F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249OK


2.3 Validate unique FIGHTER_ID and duplicates

proc sort data=work.freedom_clean1 nodupkey dupout=work.dup_ids

out=work.freedom_sorted;

by FIGHTER_ID;

run;

proc print data=work.dup_ids;

 title 'Duplicate IDs (should be empty)'; 

run;

title;

LOG:

NOTE: There were 12 observations read from the data set WORK.FREEDOM_CLEAN1.
NOTE: 0 observations with duplicate key values were deleted.


2.4 Birth year plausibility checks

data work.freedom_validated;

set work.freedom_sorted;

length birth_flag $30;

if BIRTH_YEAR = . then birth_flag='MISSING_BIRTH_YEAR';

else if BIRTH_YEAR < 1700 or BIRTH_YEAR > 2005 then birth_flag='BIRTH_YEAR_OUT_OF_RANGE';

else birth_flag='OK';

run;

proc print data=work.freedom_validated;

run;

OUTPUT:

ObsFIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDEcoord_flagbirth_flag
1F001Mahatma GandhiGujarat186921.809072.1360OKOK
2F002Bhagat SinghPunjab190731.147175.3412OKOK
3F003Subhas Chandra BoseWestbengal189722.572688.3639OKOK
4F004Jawaharlal NehruUttarpradesh188926.846780.9462OKOK
5F005Sardar Vallabhbhai PatelGujarat187523.022572.5714OKOK
6F006Rani LakshmibaiJhansi182825.446078.5689OKOK
7F007Bal Gangadhar TilakMaharashtra185618.520473.8567OKOK
8F008Lala Lajpat RaiPunjab186528.704177.1025OKOK
9F009Annie BesantMadras184713.082780.2707OKOK
10F010Subramania BharatiTamilnadu188210.790579.1370OKOK
11F011Chittaranjan DasKolkata187022.572688.3639OKOK
12F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249OKOK


proc freq data=work.freedom_validated; 

 tables birth_flag coord_flag / nocum;

run;

OUTPUT:

The FREQ Procedure

birth_flagFrequencyPercent
OK12100.00
coord_flagFrequencyPercent
OK12100.00

2.5 Create a final clean dataset (dropping or flagging bad rows)

data work.freedom_final work.freedom_qc;

set work.freedom_validated;

if coord_flag='OK' and birth_flag='OK' then output work.freedom_final;

else output work.freedom_qc;

run;

proc print data=work.freedom_qc;

 title 'QC issues'; 

run;

LOG:

NOTE: The data set WORK.FREEDOM_QC has 0 observations and 8 variables.

NOTE: No observations in data set WORK.FREEDOM_QC.

proc print data=work.freedom_final; 

 title 'Clean records';

run;

OUTPUT:

Clean records

ObsFIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDEcoord_flagbirth_flag
1F001Mahatma GandhiGujarat186921.809072.1360OKOK
2F002Bhagat SinghPunjab190731.147175.3412OKOK
3F003Subhas Chandra BoseWestbengal189722.572688.3639OKOK
4F004Jawaharlal NehruUttarpradesh188926.846780.9462OKOK
5F005Sardar Vallabhbhai PatelGujarat187523.022572.5714OKOK
6F006Rani LakshmibaiJhansi182825.446078.5689OKOK
7F007Bal Gangadhar TilakMaharashtra185618.520473.8567OKOK
8F008Lala Lajpat RaiPunjab186528.704177.1025OKOK
9F009Annie BesantMadras184713.082780.2707OKOK
10F010Subramania BharatiTamilnadu188210.790579.1370OKOK
11F011Chittaranjan DasKolkata187022.572688.3639OKOK
12F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249OKOK

3. Distance calculation and PROC MEANS for distance stats

3.1 Haversine function in DATA step

%let lat_ref=28.6139;

%let lon_ref=77.2090;


data work.freedom_dist;

set work.freedom_final;

/* convert degrees to radians */

pi = constant('pi');

deg2rad = pi/180;

lat1 = &lat_ref. * deg2rad;

lon1 = &lon_ref. * deg2rad;

lat2 = LATITUDE * deg2rad;

lon2 = LONGITUDE * deg2rad;

dlat = lat2 - lat1;

dlon = lon2 - lon1;

a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2;

c = 2 * atan2(sqrt(a), sqrt(1-a));

earth_km = 6371; /* average Earth radius in km */

distance_km = round(earth_km * c, 0.1);

drop pi deg2rad lat1 lon1 lat2 lon2 dlat dlon a c earth_km;

run;

proc print data=work.freedom_dist;

 var FIGHTER_ID NAME STATE BIRTH_YEAR LATITUDE LONGITUDE distance_km;

run;

OUTPUT:

ObsFIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDEdistance_km
1F001Mahatma GandhiGujarat186921.809072.1360912.4
2F002Bhagat SinghPunjab190731.147175.3412334.3
3F003Subhas Chandra BoseWestbengal189722.572688.36391303.8
4F004Jawaharlal NehruUttarpradesh188926.846780.9462417.0
5F005Sardar Vallabhbhai PatelGujarat187523.022572.5714775.7
6F006Rani LakshmibaiJhansi182825.446078.5689377.1
7F007Bal Gangadhar TilakMaharashtra185618.520473.85671173.0
8F008Lala Lajpat RaiPunjab186528.704177.102514.4
9F009Annie BesantMadras184713.082780.27071755.8
10F010Subramania BharatiTamilnadu188210.790579.13701992.0
11F011Chittaranjan DasKolkata187022.572688.36391303.8
12F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249807.3


3.2 Use PROC MEANS for distance-based statistics

proc means data=work.freedom_dist n mean median min max stddev maxdec=2;

var distance_km;

run;

OUTPUT:

The MEANS Procedure

Analysis Variable : distance_km
NMeanMedianMinimumMaximumStd Dev
12930.55859.8514.401992.00599.20

3.3 Use PROC SQL for grouped aggregates by region/state

proc sql;

create table work.state_distance_summary as

select STATE,

count(*) as N,

mean(distance_km) as MEAN_DIST format=8.2,

median(distance_km) as MEDIAN_DIST format=8.2,

min(distance_km) as MIN_DIST format=8.2,

max(distance_km) as MAX_DIST format=8.2

from work.freedom_dist

group by STATE;

quit;

proc print data=work.state_distance_summary; 

run;

OUTPUT:

ObsSTATENMEAN_DISTMEDIAN_DISTMIN_DISTMAX_DIST
1Gujarat2844.05844.05775.70912.40
2Jhansi1377.10377.10377.10377.10
3Kolkata11303.801303.801303.801303.80
4Madras11755.801755.801755.801755.80
5Maharashtra11173.001173.001173.001173.00
6Peshawar1807.30807.30807.30807.30
7Punjab2174.35174.3514.40334.30
8Tamilnadu11992.001992.001992.001992.00
9Uttarpradesh1417.00417.00417.00417.00
10Westbengal11303.801303.801303.801303.80


4. Mapping with PROC SGPLOT scatter PLOT

data plot_points;

    set work.freedom_raw;

run;

proc print data=work.plot_points;

run;

OUTPUT:

ObsFIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDE
1F001Mahatma GandhiGujarat186921.809072.1360
2F002Bhagat SinghPunjab190731.147175.3412
3F003Subhas Chandra BoseWestBengal189722.572688.3639
4F004Jawaharlal NehruUttarPradesh188926.846780.9462
5F005Sardar Vallabhbhai PatelGujarat187523.022572.5714
6F006Rani LakshmibaiJhansi182825.446078.5689
7F007Bal Gangadhar TilakMaharashtra185618.520473.8567
8F008Lala Lajpat RaiPunjab186528.704177.1025
9F009Annie BesantMadras184713.082780.2707
10F010Subramania BharatiTamilNadu188210.790579.1370
11F011Chittaranjan DasKolkata187022.572688.3639
12F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249

data plot_points;

    set plot_points end=last;

    output;

    if last then do;

    NAME = "Satyavati Devi";

        STATE = "New Delhi";

        FIGHTER_ID = "F013";

BIRTH_YEAR = 1905;

        Latitude = 28.6139;

        Longitude = 77.2090;

        output;

    end;

run;

proc print data=work.plot_points;

run;

OUTPUT:

ObsFIGHTER_IDNAMESTATEBIRTH_YEARLATITUDELONGITUDE
1F001Mahatma GandhiGujarat186921.809072.1360
2F002Bhagat SinghPunjab190731.147175.3412
3F003Subhas Chandra BoseWestBengal189722.572688.3639
4F004Jawaharlal NehruUttarPradesh188926.846780.9462
5F005Sardar Vallabhbhai PatelGujarat187523.022572.5714
6F006Rani LakshmibaiJhansi182825.446078.5689
7F007Bal Gangadhar TilakMaharashtra185618.520473.8567
8F008Lala Lajpat RaiPunjab186528.704177.1025
9F009Annie BesantMadras184713.082780.2707
10F010Subramania BharatiTamilNadu188210.790579.1370
11F011Chittaranjan DasKolkata187022.572688.3639
12F012Khan Abdul Ghaffar KhanPeshawar189034.015171.5249
13F013Satyavati DeviNew Delhi190528.613977.2090

proc sgplot data=plot_points;

    scatter x=Longitude y=Latitude / datalabel=NAME markerattrs=(symbol=CircleFilled size=8);

    xaxis label='Longitude';

    yaxis label='Latitude';

    title 'Indian Freedom Fighters - Geographic Scatter Plot (Longitude x Latitude)';

run;

OUTPUT:

The SGPlot Procedure


5. Macro-driven mapping

5.1 Macro %make_scatter_map

%macro make_scatter_map(ds=work.freedom_dist, reflat=28.6139, reflon=77.2090, title=Freedom Fighters Map);


    /* Create a 1-point dataset for the reference location */

    data work._refpoint;

        length Label $30;

        Longitude = &reflon.;

        Latitude  = &reflat.;

        Label = "Reference Point";

    run;


    ods graphics / reset width=900px height=600px imagename="map_&sysdate9._&systime";


    proc sgplot data=&ds.;

        scatter x=Longitude y=Latitude /

            datalabel=NAME

            markerattrs=(symbol=CircleFilled size=8 color=blue);

    run;


    proc sgplot data=work._refpoint;

        scatter x=Longitude y=Latitude /

            datalabel=Label

            markerattrs=(symbol=StarFilled color=red size=14);

        xaxis label='Longitude';

        yaxis label='Latitude';

        title "&title";

    run;


%mend make_scatter_map;


%make_scatter_map(ds=work.freedom_raw, title=Freedom Fighters Of India);

OUTPUT:





        






To Visit My Previous E-Commerce Dataset:Click Here
To Visit My Previous Length,Input,Retain Statements:Click Here
To Visit My Previous Urban Traffic Dataset:Click Here
To Visit My Previous Home Energy Consumption Dataset:Click Here







Follow Us On : 


 


--->FOLLOW OUR BLOG FOR MORE INFORMATION.
--->PLEASE DO COMMENTS AND SHARE OUR BLOG.
















No comments:

Post a Comment