TENNIS PLAYERS PERFORMANCE STUDY | DATA STEP DERIVATIONS | PROC SQL SUMMARIES | PROC EXPAND TIME-SERIES ANALYSIS | PROC SGPLOT VISUALIZATION | MACRO AUTOMATION
1) Create master dataset
options validvarname=any nocenter;
data work.tennis_players;
infile datalines dlm=',' dsd truncover;
length PlayerID $4 Name $30 Country $20 Hand $1;
format DOB ddmmyy10.;
input PlayerID :$4. Name :$30. Country :$20. DOB :ddmmyy10. Height_cm Hand :$1.;
datalines;
P001,"Rafael Nadal","Spain",03-06-1986,185,R
P002,"Novak Djokovic","Serbia",22-05-1987,188,R
P003,"Roger Federer","Switzerland",08-08-1981,185,R
P004,"Andy Murray","United Kingdom",15-05-1987,190,R
P005,"Serena Williams","USA",26-09-1981,175,R
P006,"Naomi Osaka","Japan",16-10-1997,180,R
P007,"Ashleigh Barty","Australia",24-04-1996,166,R
P008,"Simona Halep","Romania",27-09-1991,168,R
P009,"Dominic Thiem","Austria",03-09-1993,185,R
P010,"Alexander Zverev","Germany",20-04-1997,198,R
P011,"Iga Swiatek","Poland",31-05-2001,175,R
P012,"Carlos Alcaraz","Spain",05-05-2003,185,R
;
run;
proc print data=work.tennis_players noobs;
run;
OUTPUT:
| PlayerID | Name | Country | Hand | DOB | Height_cm |
|---|---|---|---|---|---|
| P001 | Rafael Nadal | Spain | R | 03/06/1986 | 185 |
| P002 | Novak Djokovic | Serbia | R | 22/05/1987 | 188 |
| P003 | Roger Federer | Switzerland | R | 08/08/1981 | 185 |
| P004 | Andy Murray | United Kingdom | R | 15/05/1987 | 190 |
| P005 | Serena Williams | USA | R | 26/09/1981 | 175 |
| P006 | Naomi Osaka | Japan | R | 16/10/1997 | 180 |
| P007 | Ashleigh Barty | Australia | R | 24/04/1996 | 166 |
| P008 | Simona Halep | Romania | R | 27/09/1991 | 168 |
| P009 | Dominic Thiem | Austria | R | 03/09/1993 | 185 |
| P010 | Alexander Zverev | Germany | R | 20/04/1997 | 198 |
| P011 | Iga Swiatek | Poland | R | 31/05/2001 | 175 |
| P012 | Carlos Alcaraz | Spain | R | 05/05/2003 | 185 |
2) DATA step derivations: Age (as of a fixed date), AgeGroup, Height_m
data work.tennis_players_deriv;
set work.tennis_players;
/* Reference date is explicit to make results reproducible */
ref_date = '11NOV2025'd;
/* Calculate integer age at ref_date accurately */
years = intck('year', DOB, ref_date);
/* Adjust if birthday hasn't occurred yet in reference year */
if month(ref_date) < month(DOB) or (month(ref_date)=month(DOB) and
day(ref_date)<day(DOB)) then Age = years - 1;
else Age = years;
/* Height in meters */
Height_m = round(Height_cm / 100, 0.01);
/* Age group */
if Age < 20 then AgeGroup = 'Under20';
else if 20 <= Age < 25 then AgeGroup = '20-24';
else if 25 <= Age < 30 then AgeGroup = '25-29';
else if 30 <= Age < 35 then AgeGroup = '30-34';
else AgeGroup = '35plus';
/* Drop helper variable years, keep ref_date for traceability */
drop years;
run;
proc print data=work.tennis_players_deriv noobs;
var PlayerID Name Country DOB Age AgeGroup Height_cm Height_m Hand ref_date;
run;
OUTPUT:
| PlayerID | Name | Country | DOB | Age | AgeGroup | Height_cm | Height_m | Hand | ref_date |
|---|---|---|---|---|---|---|---|---|---|
| P001 | Rafael Nadal | Spain | 03/06/1986 | 39 | 35plus | 185 | 1.85 | R | 24056 |
| P002 | Novak Djokovic | Serbia | 22/05/1987 | 38 | 35plus | 188 | 1.88 | R | 24056 |
| P003 | Roger Federer | Switzerland | 08/08/1981 | 44 | 35plus | 185 | 1.85 | R | 24056 |
| P004 | Andy Murray | United Kingdom | 15/05/1987 | 38 | 35plus | 190 | 1.90 | R | 24056 |
| P005 | Serena Williams | USA | 26/09/1981 | 44 | 35plus | 175 | 1.75 | R | 24056 |
| P006 | Naomi Osaka | Japan | 16/10/1997 | 28 | 25-29 | 180 | 1.80 | R | 24056 |
| P007 | Ashleigh Barty | Australia | 24/04/1996 | 29 | 25-29 | 166 | 1.66 | R | 24056 |
| P008 | Simona Halep | Romania | 27/09/1991 | 34 | 30-34 | 168 | 1.68 | R | 24056 |
| P009 | Dominic Thiem | Austria | 03/09/1993 | 32 | 30-34 | 185 | 1.85 | R | 24056 |
| P010 | Alexander Zverev | Germany | 20/04/1997 | 28 | 25-29 | 198 | 1.98 | R | 24056 |
| P011 | Iga Swiatek | Poland | 31/05/2001 | 24 | 20-24 | 175 | 1.75 | R | 24056 |
| P012 | Carlos Alcaraz | Spain | 05/05/2003 | 22 | 20-24 | 185 | 1.85 | R | 24056 |
3) PROC SQL summaries: counts, averages, country-level rollups
proc sql;
create table work.country_summary as
select Country,
count(*) as NumPlayers,
mean(Age) as AvgAge format=6.2,
min(Age) as Youngest,
max(Age) as Oldest,
mean(Height_cm) as AvgHeight_cm format=6.1
from work.tennis_players_deriv
group by Country
order by NumPlayers desc;
quit;
proc print data=work.country_summary noobs;
run;
OUTPUT:
| Country | NumPlayers | AvgAge | Youngest | Oldest | AvgHeight_cm |
|---|---|---|---|---|---|
| Spain | 2 | 30.50 | 22 | 39 | 185.0 |
| Romania | 1 | 34.00 | 34 | 34 | 168.0 |
| Germany | 1 | 28.00 | 28 | 28 | 198.0 |
| Serbia | 1 | 38.00 | 38 | 38 | 188.0 |
| USA | 1 | 44.00 | 44 | 44 | 175.0 |
| Poland | 1 | 24.00 | 24 | 24 | 175.0 |
| United Kingdom | 1 | 38.00 | 38 | 38 | 190.0 |
| Australia | 1 | 29.00 | 29 | 29 | 166.0 |
| Austria | 1 | 32.00 | 32 | 32 | 185.0 |
| Japan | 1 | 28.00 | 28 | 28 | 180.0 |
| Switzerland | 1 | 44.00 | 44 | 44 | 185.0 |
4) Create synthetic monthly ranking points time series for 3 players
/* Choose 3 players to demonstrate time-series: P002, P001, P012 */
data work.rankings;
length PlayerID $4;
format Month yymmn6.;
/* Loop through player IDs */
do PlayerID = 'P002','P001','P012';
/* Assign a numeric ID for each player */
select (PlayerID);
when ('P002') player_num = 1;
when ('P001') player_num = 2;
when ('P012') player_num = 3;
otherwise player_num = 0;
end;
/* Generate 12 months of data per player */
do m = 0 to 11;
Month = intnx('month', '01JAN2025'd, m, 'begin');
/* Deterministic seed for reproducible randomness */
seed = 12345 + player_num*1000 + m;
call streaminit(seed);
/* Different base points per player */
base = 8000 + player_num*250;
/* Monthly seasonal and random variation */
seasonal = 200 * sin(2*3.14159*m/12) + 50*mod(m,3);
noise = round(rand('NORMAL', 0, 150), 1);
/* Final player points */
Points = round(base + seasonal + noise, 1);
output;
end;
end;
/* Drop intermediate variables */
drop m seed base seasonal noise player_num;
run;
proc print data=work.rankings(obs=12) noobs;
run;
OUTPUT:
| PlayerID | Month | Points |
|---|---|---|
| P002 | 202501 | 8281 |
| P002 | 202502 | 8363 |
| P002 | 202503 | 8752 |
| P002 | 202504 | 8285 |
| P002 | 202505 | 8585 |
| P002 | 202506 | 8330 |
| P002 | 202507 | 8127 |
| P002 | 202508 | 8151 |
| P002 | 202509 | 8147 |
| P002 | 202510 | 7846 |
| P002 | 202511 | 7933 |
| P002 | 202512 | 8413 |
5) PROC EXPAND: compute 3-month and 6-month moving averages (SAS/ETS required)
proc sort data=work.rankings;
by PlayerID Month;
run;
proc print data=work.rankings;
run;
OUTPUT:
| Obs | PlayerID | Month | Points |
|---|---|---|---|
| 1 | P001 | 202501 | 8586 |
| 2 | P001 | 202502 | 8507 |
| 3 | P001 | 202503 | 8515 |
| 4 | P001 | 202504 | 8618 |
| 5 | P001 | 202505 | 8561 |
| 6 | P001 | 202506 | 8671 |
| 7 | P001 | 202507 | 8601 |
| 8 | P001 | 202508 | 8215 |
| 9 | P001 | 202509 | 8357 |
| 10 | P001 | 202510 | 8174 |
| 11 | P001 | 202511 | 8417 |
| 12 | P001 | 202512 | 8366 |
| 13 | P002 | 202501 | 8281 |
| 14 | P002 | 202502 | 8363 |
| 15 | P002 | 202503 | 8752 |
| 16 | P002 | 202504 | 8285 |
| 17 | P002 | 202505 | 8585 |
| 18 | P002 | 202506 | 8330 |
| 19 | P002 | 202507 | 8127 |
| 20 | P002 | 202508 | 8151 |
| 21 | P002 | 202509 | 8147 |
| 22 | P002 | 202510 | 7846 |
| 23 | P002 | 202511 | 7933 |
| 24 | P002 | 202512 | 8413 |
| 25 | P012 | 202501 | 8512 |
| 26 | P012 | 202502 | 8507 |
| 27 | P012 | 202503 | 9154 |
| 28 | P012 | 202504 | 8888 |
| 29 | P012 | 202505 | 8746 |
| 30 | P012 | 202506 | 9161 |
| 31 | P012 | 202507 | 8466 |
| 32 | P012 | 202508 | 8588 |
| 33 | P012 | 202509 | 8917 |
| 34 | P012 | 202510 | 8585 |
| 35 | P012 | 202511 | 8682 |
| 36 | P012 | 202512 | 8861 |
proc expand data=work.rankings out=work.rankings_ma method=none;
by PlayerID;
id Month;
convert Points = MA_3 / transformout=(movave 3);
convert Points = MA_6 / transformout=(movave 6);
run;
proc print data=work.rankings_ma(obs=18) noobs;
run;
OUTPUT:
| PlayerID | Month | MA_3 | MA_6 | Points |
|---|---|---|---|---|
| P001 | 202501 | 8586.00 | 8586.00 | 8586 |
| P001 | 202502 | 8546.50 | 8546.50 | 8507 |
| P001 | 202503 | 8536.00 | 8536.00 | 8515 |
| P001 | 202504 | 8546.67 | 8556.50 | 8618 |
| P001 | 202505 | 8564.67 | 8557.40 | 8561 |
| P001 | 202506 | 8616.67 | 8576.33 | 8671 |
| P001 | 202507 | 8611.00 | 8578.83 | 8601 |
| P001 | 202508 | 8495.67 | 8530.17 | 8215 |
| P001 | 202509 | 8391.00 | 8503.83 | 8357 |
| P001 | 202510 | 8248.67 | 8429.83 | 8174 |
| P001 | 202511 | 8316.00 | 8405.83 | 8417 |
| P001 | 202512 | 8319.00 | 8355.00 | 8366 |
| P002 | 202501 | 8281.00 | 8281.00 | 8281 |
| P002 | 202502 | 8322.00 | 8322.00 | 8363 |
| P002 | 202503 | 8465.33 | 8465.33 | 8752 |
| P002 | 202504 | 8466.67 | 8420.25 | 8285 |
| P002 | 202505 | 8540.67 | 8453.20 | 8585 |
| P002 | 202506 | 8400.00 | 8432.67 | 8330 |
6) Plot time series and moving averages with PROC SGPLOT
proc sgpanel data=work.rankings_ma;
panelby PlayerID / layout=panel onepanel;
series x=Month y=Points / markers;
series x=Month y=MA_3 / lineattrs=(pattern=2) legendlabel='3-Month MA';
series x=Month y=MA_6 / lineattrs=(pattern=3) legendlabel='6-Month MA';
rowaxis label="Ranking Points";
colaxis label="Month";
title "Ranking Points and Moving Averages by Player";
run;
OUTPUT:
7) Macro examples:
- %derive_core(ds=, out=) : run same derivations on any dataset with DOB, Height_cm
- %player_movingavg(pid=, out=): generate moving average for a single player using PROC EXPAND
%macro derive_core(ds=work.tennis_players, out=);
data &out.;
set &ds.;
ref_date='11NOV2025'd;
years = intck('year', DOB, ref_date);
if month(ref_date) < month(DOB) or (month(ref_date)=month(DOB) and day(ref_date)<day(DOB))
then Age = years - 1;
else Age = years;
Height_m = round(Height_cm/100,0.01);
if Age < 20 then AgeGroup='Under20';
else if 20 <= Age < 25 then AgeGroup='20-24';
else if 25 <= Age < 30 then AgeGroup='25-29';
else if 30 <= Age < 35 then AgeGroup='30-34';
else AgeGroup='35plus';
drop years;
run;
proc print;run;
%mend derive_core;
%derive_core(ds=work.tennis_players, out=work.tennis_players_macro);
OUTPUT:
| Obs | PlayerID | Name | Country | Hand | DOB | Height_cm | ref_date | Age | Height_m | AgeGroup |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | P001 | Rafael Nadal | Spain | R | 03/06/1986 | 185 | 24056 | 39 | 1.85 | 35plus |
| 2 | P002 | Novak Djokovic | Serbia | R | 22/05/1987 | 188 | 24056 | 38 | 1.88 | 35plus |
| 3 | P003 | Roger Federer | Switzerland | R | 08/08/1981 | 185 | 24056 | 44 | 1.85 | 35plus |
| 4 | P004 | Andy Murray | United Kingdom | R | 15/05/1987 | 190 | 24056 | 38 | 1.90 | 35plus |
| 5 | P005 | Serena Williams | USA | R | 26/09/1981 | 175 | 24056 | 44 | 1.75 | 35plus |
| 6 | P006 | Naomi Osaka | Japan | R | 16/10/1997 | 180 | 24056 | 28 | 1.80 | 25-29 |
| 7 | P007 | Ashleigh Barty | Australia | R | 24/04/1996 | 166 | 24056 | 29 | 1.66 | 25-29 |
| 8 | P008 | Simona Halep | Romania | R | 27/09/1991 | 168 | 24056 | 34 | 1.68 | 30-34 |
| 9 | P009 | Dominic Thiem | Austria | R | 03/09/1993 | 185 | 24056 | 32 | 1.85 | 30-34 |
| 10 | P010 | Alexander Zverev | Germany | R | 20/04/1997 | 198 | 24056 | 28 | 1.98 | 25-29 |
| 11 | P011 | Iga Swiatek | Poland | R | 31/05/2001 | 175 | 24056 | 24 | 1.75 | 20-24 |
| 12 | P012 | Carlos Alcaraz | Spain | R | 05/05/2003 | 185 | 24056 | 22 | 1.85 | 20-24 |
/* Macro to compute moving averages for a single player ID (convenience wrapper) */
%macro player_movingavg(pid=P002, out=work.pid_ma);
data _tmp;
set work.rankings;
where PlayerID="&pid.";
run;
proc print;run;
proc sort data=_tmp;
by Month;
run;
proc expand data=_tmp out=&out. method=none;
id Month;
convert Points = MA_3 / transformout=(movave 3);
convert Points = MA_6 / transformout=(movave 6);
run;
%mend player_movingavg;
%player_movingavg(pid=P001, out=work.P001_ma);
OUTPUT:
| Obs | PlayerID | Month | Points |
|---|---|---|---|
| 1 | P001 | 202501 | 8586 |
| 2 | P001 | 202502 | 8507 |
| 3 | P001 | 202503 | 8515 |
| 4 | P001 | 202504 | 8618 |
| 5 | P001 | 202505 | 8561 |
| 6 | P001 | 202506 | 8671 |
| 7 | P001 | 202507 | 8601 |
| 8 | P001 | 202508 | 8215 |
| 9 | P001 | 202509 | 8357 |
| 10 | P001 | 202510 | 8174 |
| 11 | P001 | 202511 | 8417 |
| 12 | P001 | 202512 | 8366 |
proc print data=work.P001_ma noobs;
run;
OUTPUT:
| Month | MA_3 | MA_6 | PlayerID | Points |
|---|---|---|---|---|
| 202501 | 8586.00 | 8586.00 | P001 | 8586 |
| 202502 | 8546.50 | 8546.50 | P001 | 8507 |
| 202503 | 8536.00 | 8536.00 | P001 | 8515 |
| 202504 | 8546.67 | 8556.50 | P001 | 8618 |
| 202505 | 8564.67 | 8557.40 | P001 | 8561 |
| 202506 | 8616.67 | 8576.33 | P001 | 8671 |
| 202507 | 8611.00 | 8578.83 | P001 | 8601 |
| 202508 | 8495.67 | 8530.17 | P001 | 8215 |
| 202509 | 8391.00 | 8503.83 | P001 | 8357 |
| 202510 | 8248.67 | 8429.83 | P001 | 8174 |
| 202511 | 8316.00 | 8405.83 | P001 | 8417 |
| 202512 | 8319.00 | 8355.00 | P001 | 8366 |
No comments:
Post a Comment