145.A COMPREHENSIVE SAS PROGRAMMING GUIDE USING A FICTIONAL WORLD CINEMA DATASET: EXPLORING GLOBAL FILM INDUSTRY INSIGHTS THROUGH DATA TRANSFORMATION, STATISTICAL PROCEDURES, VISUALIZATION, AND PREDICTIVE MODELING

A COMPREHENSIVE SAS PROGRAMMING GUIDE USING A FICTIONAL WORLD CINEMA DATASET: EXPLORING GLOBAL FILM INDUSTRY INSIGHTS THROUGH DATA TRANSFORMATION, STATISTICAL PROCEDURES, VISUALIZATION, AND PREDICTIVE MODELING


data world_cinema;

length Title $50 Genre $20 Country $20 Language $20;

do MovieID = 1 to 200;

Title = catt("Movie_", MovieID);

Genre = scan("Action Comedy Drama Thriller Horror Sci-Fi Fantasy", ceil(ranuni(123)*7), ' ');

Country = scan("USA UK India France Japan South Korea Germany Brazil Canada", ceil(ranuni(321)*9), ' ');

Language = scan("English Hindi French Japanese Korean German Portuguese", ceil(ranuni(567)*7), ' ');

Release_Year = int(2015 + ranuni(890)*10);

Budget = round(5 + ranuni(102)*95, 0.1); /* Between $5M and $100M */

Box_Office = round(Budget + ranuni(321)*150, 0.1);

IMDB_Rating = round(5 + ranuni(812)*5, 0.1); /* 5.0 to 10.0 */

RT_Score = round(50 + ranuni(432)*50);

Duration = int(90 + ranuni(999)*60); /* 90 to 150 minutes */

Awards_Won = int(ranuni(345)*10);

Screens = int(100 + ranuni(765)*3000);

output;

end;

drop MovieID;

run;

proc print data=world_cinema (obs=10); 

title "Sample Movies Data";

run;


Output:

Obs Title Genre Country Language Release_Year Budget Box_Office IMDB_Rating RT_Score Duration Awards_Won Screens
1 Movie_1 Sci-Fi India Hindi 2024 38.9 72.1 8.9 70 97 1 2428
2 Movie_2 Thriller Brazil Hindi 2022 57.7 137.4 9.3 57 141 6 2409
3 Movie_3 Horror India Japanese 2024 9.4 101.6 8.3 69 125 3 1717
4 Movie_4 Comedy Japan Japanese 2015 15.3 102.6 8.4 74 98 7 2982
5 Movie_5 Sci-Fi Brazil English 2021 89.7 132.3 8.3 90 137 3 2436
6 Movie_6 Thriller Korea German 2015 11.9 49.9 6.2 90 131 2 2671
7 Movie_7 Thriller Korea Korean 2020 12.2 106.3 8.7 91 95 5 1154
8 Movie_8 Comedy USA Hindi 2024 5.8 41.7 5.1 92 130 0 1127
9 Movie_9 Sci-Fi UK German 2024 95.5 128.8 6.4 67 139 6 1027
10 Movie_10 Horror Germany German 2023 87.5 183.7 9.2 97 102 8 1139


proc contents data=world_cinema;

run;


Output:

                                                          The CONTENTS Procedure

Data Set Name WORK.WORLD_CINEMA Observations 200
Member Type DATA Variables 12
Engine V9 Indexes 0
Created 14/09/2015 00:05:41 Observation Length 176
Last Modified 14/09/2015 00:05:41 Deleted Observations 0
Protection   Compressed NO
Data Set Type   Sorted NO
Label      
Data Representation WINDOWS_64    
Encoding wlatin1 Western (Windows)    


Engine/Host Dependent Information
Data Set Page Size 65536
Number of Data Set Pages 1
First Data Page 1
Max Obs per Page 371
Obs in First Data Page 200
Number of Data Set Repairs 0
ExtendObsCounter YES
Filename C:\Users\Lenovo\AppData\Local\Temp\SAS Temporary Files\_TD6140_DESKTOP-QFAA4KV_\world_cinema.sas7bdat
Release Created 9.0401M2
Host Created X64_8HOME


Alphabetic List of Variables and Attributes
# Variable Type Len
11 Awards_Won Num 8
7 Box_Office Num 8
6 Budget Num 8
3 Country Char 20
10 Duration Num 8
2 Genre Char 20
8 IMDB_Rating Num 8
4 Language Char 20
9 RT_Score Num 8
5 Release_Year Num 8
12 Screens Num 8
1 Title Char 50


proc means data=world_cinema mean std min max n;

var Budget Box_Office IMDB_Rating RT_Score Duration Awards_Won Screens;

run;


Output:

                                                            The MEANS Procedure

Variable Mean Std Dev Minimum Maximum N
Budget
Box_Office
IMDB_Rating
RT_Score
Duration
Awards_Won
Screens
52.6815000
125.9395000
7.3215000
77.7450000
118.1400000
4.5450000
1739.99
27.0070939
49.7498364
1.4251819
14.6105206
16.7010486
2.8598723
802.6308217
5.6000000
16.9000000
5.0000000
50.0000000
90.0000000
0
137.0000000
99.5000000
237.8000000
10.0000000
100.0000000
149.0000000
9.0000000
3087.00
200
200
200
200
200
200
200


proc freq data=world_cinema;

tables Genre Country Language;

run;


Output:

                                                         The FREQ Procedure

Genre Frequency Percent Cumulative
Frequency
Cumulative
Percent
Action 27 13.50 27 13.50
Comedy 20 10.00 47 23.50
Drama 25 12.50 72 36.00
Fantasy 33 16.50 105 52.50
Horror 29 14.50 134 67.00
Sci-Fi 29 14.50 163 81.50
Thriller 37 18.50 200 100.00


Country Frequency Percent Cumulative
Frequency
Cumulative
Percent
Brazil 31 15.50 31 15.50
France 27 13.50 58 29.00
Germany 19 9.50 77 38.50
India 26 13.00 103 51.50
Japan 16 8.00 119 59.50
Korea 19 9.50 138 69.00
South 19 9.50 157 78.50
UK 21 10.50 178 89.00
USA 22 11.00 200 100.00


Language Frequency Percent Cumulative
Frequency
Cumulative
Percent
English 34 17.00 34 17.00
French 17 8.50 51 25.50
German 33 16.50 84 42.00
Hindi 33 16.50 117 58.50
Japanese 23 11.50 140 70.00
Korean 26 13.00 166 83.00
Portuguese 34 17.00 200 100.00


proc means data=world_cinema mean std;

class Genre;

var Budget Box_Office IMDB_Rating;

run;


Output:

                                                            The MEANS Procedure

Genre N Obs Variable Mean Std Dev
Action 27
Budget
Box_Office
IMDB_Rating
54.9222222
142.9962963
6.7629630
29.1830870
56.5423223
1.3124106
Comedy 20
Budget
Box_Office
IMDB_Rating
41.0700000
117.5650000
7.4000000
30.5840189
49.5787654
1.4253347
Drama 25
Budget
Box_Office
IMDB_Rating
57.0120000
127.8080000
7.4400000
18.4919649
47.1742507
1.4770466
Fantasy 33
Budget
Box_Office
IMDB_Rating
55.8484848
125.8515152
7.3393939
27.7726048
52.4093630
1.4924685
Horror 29
Budget
Box_Office
IMDB_Rating
48.5965517
118.0724138
7.8241379
27.6861296
44.5158631
1.2968795
Sci-Fi 29
Budget
Box_Office
IMDB_Rating
63.5965517
133.4551724
7.2206897
26.5215341
50.5380944
1.3906266
Thriller 37
Budget
Box_Office
IMDB_Rating
46.2189189
117.1108108
7.2756757
24.7595418
46.7674791
1.4844388


proc univariate data=world_cinema;

var Box_Office;

histogram / normal;

run;


Output:

                                                      The UNIVARIATE Procedure
                                                          Variable: Box_Office

Moments
N 200 Sum Weights 200
Mean 125.9395 Sum Observations 25187.9
Std Deviation 49.7498364 Variance 2475.04622
Skewness 0.04834006 Kurtosis -0.7232748
Uncorrected SS 3664685.73 Corrected SS 492534.198
Coeff Variation 39.5029648 Std Error Mean 3.51784467


Basic Statistical Measures
Location Variability
Mean 125.9395 Std Deviation 49.74984
Median 125.6500 Variance 2475
Mode 132.9000 Range 220.90000
    Interquartile Range 75.00000


Tests for Location: Mu0=0
Test Statistic p Value
Student's t t 35.80019 Pr > |t| <.0001
Sign M 100 Pr >= |M| <.0001
Signed Rank S 10050 Pr >= |S| <.0001


Quantiles (Definition 5)
Level Quantile
100% Max 237.80
99% 230.65
95% 206.35
90% 194.05
75% Q3 164.25
50% Median 125.65
25% Q1 89.25
10% 58.95
5% 47.20
1% 31.60
0% Min 16.90


Extreme Observations
Lowest Highest
Value Obs Value Obs
16.9 194 223.9 46
31.3 48 224.8 124
31.9 154 225.3 52
33.3 198 236.0 89
34.8 109 237.8 115

                                                     The UNIVARIATE Procedure
                                              Fitted Normal Distribution for Box_Office

Parameters for Normal Distribution
Parameter Symbol Estimate
Mean Mu 125.9395
Std Dev Sigma 49.74984


Goodness-of-Fit Tests for Normal Distribution
Test Statistic p Value
Kolmogorov-Smirnov D 0.04636148 Pr > D >0.150
Cramer-von Mises W-Sq 0.06419600 Pr > W-Sq >0.250
Anderson-Darling A-Sq 0.49421938 Pr > A-Sq 0.221


Quantiles for Normal Distribution
Percent Quantile
Observed Estimated
1.0 31.6000 10.2041
5.0 47.2000 44.1083
10.0 58.9500 62.1825
25.0 89.2500 92.3837
50.0 125.6500 125.9395
75.0 164.2500 159.4953
90.0 194.0500 189.6965
95.0 206.3500 207.7707
99.0 230.6500 241.6749


proc sgplot data=world_cinema;

scatter x=Budget y=Box_Office / group=Genre;

title "Budget vs Box Office Revenue by Genre";

run;


proc sql;

create table avg_rating as

select Country, mean(IMDB_Rating) as Avg_IMDB

from world_cinema

group by Country;

quit;

proc print;run;


Output:

Obs Country Avg_IMDB
1 Brazil 7.15484
2 France 7.36667
3 Germany 7.94737
4 India 7.55000
5 Japan 7.40625
6 Korea 6.90526
7 South 7.05789
8 UK 7.23333
9 USA 7.30000


proc sgplot data=avg_rating;

vbar Country / response=Avg_IMDB;

title "Average IMDb Rating by Country";

run;


proc sql;

select Genre, avg(Budget) as Avg_Budget, avg(Box_Office) as Avg_Revenue

from world_cinema

group by Genre

having avg(Box_Office) > avg(Budget)*1.5;

quit;


Output:

Genre Avg_Budget Avg_Revenue
Action 54.92222 142.9963
Comedy 41.07 117.565
Drama 57.012 127.808
Fantasy 55.84848 125.8515
Horror 48.59655 118.0724
Sci-Fi 63.59655 133.4552
Thriller 46.21892 117.1108


proc corr data=world_cinema;

var Budget Box_Office IMDB_Rating RT_Score Duration Screens;

run;


Output:

                                                                 The CORR Procedure

6 Variables: Budget Box_Office IMDB_Rating RT_Score Duration Screens


Simple Statistics
Variable N Mean Std Dev Sum Minimum Maximum
Budget 200 52.68150 27.00709 10536 5.60000 99.50000
Box_Office 200 125.93950 49.74984 25188 16.90000 237.80000
IMDB_Rating 200 7.32150 1.42518 1464 5.00000 10.00000
RT_Score 200 77.74500 14.61052 15549 50.00000 100.00000
Duration 200 118.14000 16.70105 23628 90.00000 149.00000
Screens 200 1740 802.63082 347997 137.00000 3087


Pearson Correlation Coefficients, N = 200
Prob > |r| under H0: Rho=0
  Budget Box_Office IMDB_Rating RT_Score Duration Screens
Budget
1.00000
 
0.54153
<.0001
-0.06033
0.3961
0.08417
0.2360
0.01682
0.8132
0.04974
0.4842
Box_Office
0.54153
<.0001
1.00000
 
0.00040
0.9955
0.04909
0.4900
-0.07513
0.2903
0.08856
0.2124
IMDB_Rating
-0.06033
0.3961
0.00040
0.9955
1.00000
 
0.07312
0.3035
0.00418
0.9532
0.08811
0.2147
RT_Score
0.08417
0.2360
0.04909
0.4900
0.07312
0.3035
1.00000
 
0.09025
0.2038
-0.02476
0.7279
Duration
0.01682
0.8132
-0.07513
0.2903
0.00418
0.9532
0.09025
0.2038
1.00000
 
-0.02636
0.7110
Screens
0.04974
0.4842
0.08856
0.2124
0.08811
0.2147
-0.02476
0.7279
-0.02636
0.7110
1.00000
 


proc rank data=world_cinema out=ranked_movies descending ties=low;

var Box_Office;

ranks Revenue_Rank;

run;

proc print data=ranked_movies (obs=10);

var Title Box_Office Revenue_Rank;

title "Top 10 Movies by Revenue";

run;


Output:

                                                     Top 10 Movies by Revenue

Obs Title Box_Office Revenue_Rank
1 Movie_1 72.1 167
2 Movie_2 137.4 82
3 Movie_3 101.6 133
4 Movie_4 102.6 132
5 Movie_5 132.3 93
6 Movie_6 49.9 188
7 Movie_7 106.3 127
8 Movie_8 41.7 192
9 Movie_9 128.8 97
10 Movie_10 183.7 27


proc format;

value ratingfmt

low -< 6 = 'Poor'

6 -< 8 = 'Average'

8 - high = 'Excellent';

run;


proc freq data=world_cinema;

format IMDB_Rating ratingfmt.;

tables IMDB_Rating;

run;


Output:

                                                              The FREQ Procedure

IMDB_Rating Frequency Percent Cumulative
Frequency
Cumulative
Percent
Poor 44 22.00 44 22.00
Average 81 40.50 125 62.50
Excellent 75 37.50 200 100.00


proc sort data=world_cinema;

by Title;

run;


proc transpose data=world_cinema out=trans_movies;

by Title;

var Budget Box_Office;

run;

proc print data=trans_movies (obs=10);

run;


Output:

Obs Title _NAME_ COL1
1 Movie_1 Budget 38.9
2 Movie_1 Box_Office 72.1
3 Movie_10 Budget 87.5
4 Movie_10 Box_Office 183.7
5 Movie_100 Budget 87.3
6 Movie_100 Box_Office 222.2
7 Movie_101 Budget 77.0
8 Movie_101 Box_Office 193.4
9 Movie_102 Budget 76.4
10 Movie_102 Box_Office 185.6


data hits flops;

set world_cinema;

if Box_Office >= Budget*2 then output hits;

else if Box_Office < Budget then output flops;

run;


proc print data=hits (obs=5); title "Blockbusters"; run;

Output:

                                                                                    Blockbusters

Obs Title Genre Country Language Release_Year Budget Box_Office IMDB_Rating RT_Score Duration Awards_Won Screens
1 Movie_10 Horror Germany German 2023 87.5 183.7 9.2 97 102 8 1139
2 Movie_100 Comedy India German 2019 87.3 222.2 7.1 92 124 4 2632
3 Movie_101 Action France Korean 2015 77.0 193.4 7.1 53 114 7 1433
4 Movie_102 Thriller India Japanese 2019 76.4 185.6 6.3 53 96 9 761
5 Movie_103 Sci-Fi Brazil Hindi 2022 49.9 105.0 6.9 82 134 3 693


proc print data=flops (obs=5); title "Flop Shows"; run;

Log:

NOTE: No observations in data set WORK.FLOPS


proc ttest data=world_cinema;

class Country;

var IMDB_Rating;

where Country in ("USA", "India");

run;


Output:

                                                             The TTEST Procedure
  
                                                          Variable: IMDB_Rating

Country N Mean Std Dev Std Err Minimum Maximum
India 26 7.5500 1.2851 0.2520 5.4000 9.8000
USA 22 7.3000 1.5976 0.3406 5.0000 10.0000
Diff (1-2)   0.2500 1.4362 0.4160    


Country Method Mean 95% CL Mean Std Dev 95% CL Std Dev
India   7.5500 7.0309 8.0691 1.2851 1.0078 1.7739
USA   7.3000 6.5917 8.0083 1.5976 1.2291 2.2831
Diff (1-2) Pooled 0.2500 -0.5875 1.0875 1.4362 1.1935 1.8039
Diff (1-2) Satterthwaite 0.2500 -0.6062 1.1062      


Method Variances DF t Value Pr > |t|
Pooled Equal 46 0.60 0.5509
Satterthwaite Unequal 40.173 0.59 0.5585


Equality of Variances
Method Num DF Den DF F Value Pr > F
Folded F 21 25 1.55 0.2967


proc reg data=world_cinema;

model Box_Office = Budget Screens Duration IMDB_Rating;

title "Predicting Box Office Revenue";

run;


Output:

                                                        Predicting Box Office Revenue

                                                             The REG Procedure
                                                               Model: MODEL1
                                                   Dependent Variable: Box_Office

Number of Observations Read 200
Number of Observations Used 200


Analysis of Variance
Source DF Sum of
Squares
Mean
Square
F Value Pr > F
Model 4 150067 37517 21.36 <.0001
Error 195 342467 1756.24116    
Corrected Total 199 492534      


Root MSE 41.90753 R-Square 0.3047
Dependent Mean 125.93950 Adj R-Sq 0.2904
Coeff Var 33.27592    


Parameter Estimates
Variable DF Parameter
Estimate
Standard
Error
t Value Pr > |t|
Intercept 1 89.12079 27.37966 3.25 0.0013
Budget 1 0.99807 0.11039 9.04 <.0001
Screens 1 0.00353 0.00372 0.95 0.3445
Duration 1 -0.24684 0.17797 -1.39 0.1670
IMDB_Rating 1 0.99191 2.09711 0.47 0.6368


proc export data=world_cinema

outfile="/folders/myfolders/world_cinema.xlsx"

dbms=xlsx

replace;

run;


PRACTICE AND COMMENT YOUR CODE: 

-->PLEASE FOLLOW OUR BLOG FOR MORE UPDATES.

TO FOLLOW OUR TELEGRAM CHANNEL CLICK HERE

Comments