MOVIE DATA ANALYSIS USING PROC SQL | PRINT | MEANS | FREQ | SGPLOT | SORT | RANK | REPORT | MACROS | GENRES | REVIEWS | EARNINGS | FORMAT | CONTENTS | IMPORT | EXPORT | APPEND | COMPARE | TRANSPOSE | AUTOMATION
Step 1: Creating the Datasets in SAS
data movies;
infile datalines dsd truncover;
input Movie_ID :$5. Title :$50. Release_Date :date9. Director :$30. Language :$15. Duration_Min Certificate :$5.;
format Release_Date date9.;
datalines;
M001,The Last Ember,14JUL2022,Lara Quintana,English,130,PG-13
M002,Midnight Sonata,20NOV2021,Hiro Tanaka,Japanese,118,R
M003,Desert Rose,10MAR2020,Anjali Mehra,Hindi,142,U/A
M004,Galactic Trials,01JAN2023,Tom Eckhart,English,150,PG
M005,Silent Whispers,25DEC2022,Elena Costa,Spanish,95,R
M006,Shadows of Yore,18JUN2019,Marco Valli,Italian,105,PG
;
run;
proc print;run;
Output:
| Obs | Movie_ID | Title | Release_Date | Director | Language | Duration_Min | Certificate |
|---|---|---|---|---|---|---|---|
| 1 | M001 | The Last Ember | 14JUL2022 | Lara Quintana | English | 130 | PG-13 |
| 2 | M002 | Midnight Sonata | 20NOV2021 | Hiro Tanaka | Japanese | 118 | R |
| 3 | M003 | Desert Rose | 10MAR2020 | Anjali Mehra | Hindi | 142 | U/A |
| 4 | M004 | Galactic Trials | 01JAN2023 | Tom Eckhart | English | 150 | PG |
| 5 | M005 | Silent Whispers | 25DEC2022 | Elena Costa | Spanish | 95 | R |
| 6 | M006 | Shadows of Yore | 18JUN2019 | Marco Valli | Italian | 105 | PG |
data genres;
input Genre_ID $ Genre_Name $20.;
datalines;
G01 Action
G02 Drama
G03 Romance
G04 SciFi
G05 Thriller
G06 Fantasy
;
run;
proc print;run;
Output:
| Obs | Genre_ID | Genre_Name |
|---|---|---|
| 1 | G01 | Action |
| 2 | G02 | Drama |
| 3 | G03 | Romance |
| 4 | G04 | SciFi |
| 5 | G05 | Thriller |
| 6 | G06 | Fantasy |
data movie_genres;
input Movie_ID $ Genre_ID $;
datalines;
M001 G01
M001 G04
M002 G03
M002 G02
M003 G02
M003 G05
M004 G04
M004 G06
M005 G05
M006 G02
M006 G06
;
run;
proc print;run;
Output:
| Obs | Movie_ID | Genre_ID |
|---|---|---|
| 1 | M001 | G01 |
| 2 | M001 | G04 |
| 3 | M002 | G03 |
| 4 | M002 | G02 |
| 5 | M003 | G02 |
| 6 | M003 | G05 |
| 7 | M004 | G04 |
| 8 | M004 | G06 |
| 9 | M005 | G05 |
| 10 | M006 | G02 |
| 11 | M006 | G06 |
data movie_reviews;
input Movie_ID $ Critic_Score Audience_Score;
datalines;
M001 85 88
M002 91 80
M003 78 83
M004 88 92
M005 72 60
M006 81 85
;
run;
proc print;run;
Output:
| Obs | Movie_ID | Critic_Score | Audience_Score |
|---|---|---|---|
| 1 | M001 | 85 | 88 |
| 2 | M002 | 91 | 80 |
| 3 | M003 | 78 | 83 |
| 4 | M004 | 88 | 92 |
| 5 | M005 | 72 | 60 |
| 6 | M006 | 81 | 85 |
data movie_earnings;
input Movie_ID $ Domestic_M Gross_International_M;
Total_Earnings_M = sum(Domestic_M, Gross_International_M);
datalines;
M001 120 140
M002 50 60
M003 30 25
M004 200 250
M005 15 10
M006 40 55
;
run;
proc print;run;
Output:
| Obs | Movie_ID | Domestic_M | Gross_International_M | Total_Earnings_M |
|---|---|---|---|---|
| 1 | M001 | 120 | 140 | 260 |
| 2 | M002 | 50 | 60 | 110 |
| 3 | M003 | 30 | 25 | 55 |
| 4 | M004 | 200 | 250 | 450 |
| 5 | M005 | 15 | 10 | 25 |
| 6 | M006 | 40 | 55 | 95 |
Step 2: Using PROC SQL for Joining Datasets
proc sql;
create table movie_summary as
select a.Movie_ID, Title, Release_Date, Director, Language, Duration_Min, Certificate,
Critic_Score, Audience_Score, Domestic_M, Gross_International_M, Total_Earnings_M
from movies as a
left join movie_reviews as b on a.Movie_ID = b.Movie_ID
left join movie_earnings as c on a.Movie_ID = c.Movie_ID;
quit;
Step 3: Using PROC PRINT and PROC MEANS for Summary
Show the Movie Summary Table
proc print data=movie_summary noobs label;
title "Movie Summary Report";
run;
Output:
| Movie Summary Report |
| Movie_ID | Title | Release_Date | Director | Language | Duration_Min | Certificate | Critic_Score | Audience_Score | Domestic_M | Gross_International_M | Total_Earnings_M |
|---|---|---|---|---|---|---|---|---|---|---|---|
| M001 | The Last Ember | 14JUL2022 | Lara Quintana | English | 130 | PG-13 | 85 | 88 | 120 | 140 | 260 |
| M002 | Midnight Sonata | 20NOV2021 | Hiro Tanaka | Japanese | 118 | R | 91 | 80 | 50 | 60 | 110 |
| M003 | Desert Rose | 10MAR2020 | Anjali Mehra | Hindi | 142 | U/A | 78 | 83 | 30 | 25 | 55 |
| M004 | Galactic Trials | 01JAN2023 | Tom Eckhart | English | 150 | PG | 88 | 92 | 200 | 250 | 450 |
| M005 | Silent Whispers | 25DEC2022 | Elena Costa | Spanish | 95 | R | 72 | 60 | 15 | 10 | 25 |
| M006 | Shadows of Yore | 18JUN2019 | Marco Valli | Italian | 105 | PG | 81 | 85 | 40 | 55 | 95 |
Summary Statistics on Earnings
proc means data=movie_summary mean min max maxdec=1;
var Domestic_M Gross_International_M Total_Earnings_M;
title "Box Office Earnings Summary";
run;
Output:
| Box Office Earnings Summary |
| Variable | Mean | Minimum | Maximum | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
|
|
Step 4: Frequency Analysis with PROC FREQ
proc freq data=movies;
tables Certificate Language / nocum;
title "Frequency of Movie Certificates and Languages";
run;
Output:
| Frequency of Movie Certificates and
Languages |
| Certificate | Frequency | Percent |
|---|---|---|
| PG | 2 | 33.33 |
| PG-13 | 1 | 16.67 |
| R | 2 | 33.33 |
| U/A | 1 | 16.67 |
| Language | Frequency | Percent |
|---|---|---|
| English | 2 | 33.33 |
| Hindi | 1 | 16.67 |
| Italian | 1 | 16.67 |
| Japanese | 1 | 16.67 |
| Spanish | 1 | 16.67 |
Step 5: PROC GCHART (or SGPLOT) for Visualization
Plot Critic vs Audience Scores
proc sgplot data=movie_reviews;
title "🌟 Critic vs Audience Scores";
scatter x=Critic_Score y=Audience_Score / datalabel=Movie_ID;
run;
Log:
NOTE: PROCEDURE SGPLOT used (Total process time):
real time 3.21 seconds
cpu time 0.48 seconds
NOTE: Listing image output written to SGPlot1.png.
NOTE: There were 6 observations read from the data set WORK.MOVIE_REVIEWS.
Step 6: PROC SQL – Multi-genre Movie Listing
proc sql;
select a.Title, b.Genre_Name
from movie_genres as mg
inner join movies as a on mg.Movie_ID = a.Movie_ID
inner join genres as b on mg.Genre_ID = b.Genre_ID
order by a.Title;
quit;
Output:
| Title | Genre_Name |
|---|---|
| Desert Rose | Thriller |
| Desert Rose | Drama |
| Galactic Trials | SciFi |
| Galactic Trials | Fantasy |
| Midnight Sonata | Romance |
| Midnight Sonata | Drama |
| Shadows of Yore | Drama |
| Shadows of Yore | Fantasy |
| Silent Whispers | Thriller |
| The Last Ember | Action |
| The Last Ember | SciFi |
Step 7: Using SAS MACRO for Automated Reports
Define a Macro to Generate Summary for a Given Movie ID
%macro movie_report(mid);
%put Generating Report for Movie ID: ∣
proc sql;
select Title, Release_Date, Director, Language, Duration_Min
into :title, :rdate, :director, :lang, :dur
from movies
where Movie_ID = "&mid";
quit;
%put Title = &title;
%put Release Date = &rdate;
%put Director = &director;
proc sql;
select Critic_Score, Audience_Score
into :cs, :as
from movie_reviews
where Movie_ID = "&mid";
quit;
proc sql;
select Domestic_M, Gross_International_M, Total_Earnings_M
into :dm, :gi, :total
from movie_earnings
where Movie_ID = "&mid";
quit;
%put Critic Score = &cs;
%put Audience Score = &as;
%put Total Earnings = &total M;
title "📽️ Report for &title (&mid)";
proc print data=movie_summary(where=(Movie_ID="&mid")) noobs;
run;
%mend movie_report;
%movie_report(M004)
Output:
| Report for Galactic Trials (M004) |
| Title | Release_Date | Director | Language | Duration_Min |
|---|---|---|---|---|
| Galactic Trials | 01JAN2023 | Tom Eckhart | English | 150 |
| Report for Galactic Trials (M004) |
| Critic_Score | Audience_Score |
|---|---|
| 88 | 92 |
| Report for Galactic Trials (M004) |
| Domestic_M | Gross_International_M | Total_Earnings_M |
|---|---|---|
| 200 | 250 | 450 |
| Report for Galactic Trials (M004) |
| Movie_ID | Title | Release_Date | Director | Language | Duration_Min | Certificate | Critic_Score | Audience_Score | Domestic_M | Gross_International_M | Total_Earnings_M |
|---|---|---|---|---|---|---|---|---|---|---|---|
| M004 | Galactic Trials | 01JAN2023 | Tom Eckhart | English | 150 | PG | 88 | 92 | 200 | 250 | 450 |
Step 8: Advanced Filtering with SQL
List movies with:
Critic Score > 80
Total Earnings > 100 Million
proc sql;
select Title, Critic_Score, Total_Earnings_M
from movie_summary
where Critic_Score > 80 and Total_Earnings_M > 100
order by Total_Earnings_M desc;
quit;
Output:
| Title | Critic_Score | Total_Earnings_M |
|---|---|---|
| Galactic Trials | 88 | 450 |
| The Last Ember | 85 | 260 |
| Midnight Sonata | 91 | 110 |
Step 9: PROC SORT + PROC RANK – Top Performers
Ranking Movies by Total Earnings
proc sort data=movie_summary out=sorted_movies;
by descending Total_Earnings_M;
run;
proc rank data=sorted_movies out=ranked_movies ties=low descending;
var Total_Earnings_M;
ranks Earnings_Rank;
run;
Output:
proc print data=ranked_movies noobs;
var Movie_ID Title Total_Earnings_M Earnings_Rank;
title "Ranked List of Movies by Earnings";
run;
Output:
| Ranked List of Movies by Earnings |
| Movie_ID | Title | Total_Earnings_M | Earnings_Rank |
|---|---|---|---|
| M004 | Galactic Trials | 450 | 1 |
| M001 | The Last Ember | 260 | 2 |
| M002 | Midnight Sonata | 110 | 3 |
| M006 | Shadows of Yore | 95 | 4 |
| M003 | Desert Rose | 55 | 5 |
| M005 | Silent Whispers | 25 | 6 |
Step 10: Generate Genre-wise Summary using PROC SQL and GROUP BY
proc sql;
create table genre_summary as
select b.Genre_Name, count(a.Movie_ID) as Movie_Count
from movie_genres as a
inner join genres as b on a.Genre_ID = b.Genre_ID
group by b.Genre_Name;
quit;
proc print data=genre_summary;
title "Genre-Wise Movie Count";
run;
| Genre-Wise Movie Count |
| Obs | Genre_Name | Movie_Count |
|---|---|---|
| 1 | Action | 1 |
| 2 | Drama | 3 |
| 3 | Fantasy | 2 |
| 4 | Romance | 1 |
| 5 | SciFi | 2 |
| 6 | Thriller | 2 |
No comments:
Post a Comment