306.Are players with higher match consistency always the ones winning major tournaments?A Complete Sas Study
Are players with higher match consistency always the ones winning major tournaments?A Complete Sas Study
1) Create master dataset
options validvarname=any nocenter;
data work.tennis_players;
infile datalines dlm=',' dsd truncover;
length PlayerID $4 Name $30 Country $20 Hand $1;
format DOB ddmmyy10.;
input PlayerID :$4. Name :$30. Country :$20. DOB :ddmmyy10. Height_cm Hand :$1.;
datalines;
P001,"Rafael Nadal","Spain",03-06-1986,185,R
P002,"Novak Djokovic","Serbia",22-05-1987,188,R
P003,"Roger Federer","Switzerland",08-08-1981,185,R
P004,"Andy Murray","United Kingdom",15-05-1987,190,R
P005,"Serena Williams","USA",26-09-1981,175,R
P006,"Naomi Osaka","Japan",16-10-1997,180,R
P007,"Ashleigh Barty","Australia",24-04-1996,166,R
P008,"Simona Halep","Romania",27-09-1991,168,R
P009,"Dominic Thiem","Austria",03-09-1993,185,R
P010,"Alexander Zverev","Germany",20-04-1997,198,R
P011,"Iga Swiatek","Poland",31-05-2001,175,R
P012,"Carlos Alcaraz","Spain",05-05-2003,185,R
;
run;
proc print data=work.tennis_players noobs;
run;
OUTPUT:
| PlayerID | Name | Country | Hand | DOB | Height_cm |
|---|---|---|---|---|---|
| P001 | Rafael Nadal | Spain | R | 03/06/1986 | 185 |
| P002 | Novak Djokovic | Serbia | R | 22/05/1987 | 188 |
| P003 | Roger Federer | Switzerland | R | 08/08/1981 | 185 |
| P004 | Andy Murray | United Kingdom | R | 15/05/1987 | 190 |
| P005 | Serena Williams | USA | R | 26/09/1981 | 175 |
| P006 | Naomi Osaka | Japan | R | 16/10/1997 | 180 |
| P007 | Ashleigh Barty | Australia | R | 24/04/1996 | 166 |
| P008 | Simona Halep | Romania | R | 27/09/1991 | 168 |
| P009 | Dominic Thiem | Austria | R | 03/09/1993 | 185 |
| P010 | Alexander Zverev | Germany | R | 20/04/1997 | 198 |
| P011 | Iga Swiatek | Poland | R | 31/05/2001 | 175 |
| P012 | Carlos Alcaraz | Spain | R | 05/05/2003 | 185 |
2) DATA step derivations: Age (as of a fixed date), AgeGroup, Height_m
data work.tennis_players_deriv;
set work.tennis_players;
/* Reference date is explicit to make results reproducible */
ref_date = '11NOV2025'd;
/* Calculate integer age at ref_date accurately */
years = intck('year', DOB, ref_date);
/* Adjust if birthday hasn't occurred yet in reference year */
if month(ref_date) < month(DOB) or (month(ref_date)=month(DOB) and
day(ref_date)<day(DOB)) then Age = years - 1;
else Age = years;
/* Height in meters */
Height_m = round(Height_cm / 100, 0.01);
/* Age group */
if Age < 20 then AgeGroup = 'Under20';
else if 20 <= Age < 25 then AgeGroup = '20-24';
else if 25 <= Age < 30 then AgeGroup = '25-29';
else if 30 <= Age < 35 then AgeGroup = '30-34';
else AgeGroup = '35plus';
/* Drop helper variable years, keep ref_date for traceability */
drop years;
run;
proc print data=work.tennis_players_deriv noobs;
var PlayerID Name Country DOB Age AgeGroup Height_cm Height_m Hand ref_date;
run;
OUTPUT:
| PlayerID | Name | Country | DOB | Age | AgeGroup | Height_cm | Height_m | Hand | ref_date |
|---|---|---|---|---|---|---|---|---|---|
| P001 | Rafael Nadal | Spain | 03/06/1986 | 39 | 35plus | 185 | 1.85 | R | 24056 |
| P002 | Novak Djokovic | Serbia | 22/05/1987 | 38 | 35plus | 188 | 1.88 | R | 24056 |
| P003 | Roger Federer | Switzerland | 08/08/1981 | 44 | 35plus | 185 | 1.85 | R | 24056 |
| P004 | Andy Murray | United Kingdom | 15/05/1987 | 38 | 35plus | 190 | 1.90 | R | 24056 |
| P005 | Serena Williams | USA | 26/09/1981 | 44 | 35plus | 175 | 1.75 | R | 24056 |
| P006 | Naomi Osaka | Japan | 16/10/1997 | 28 | 25-29 | 180 | 1.80 | R | 24056 |
| P007 | Ashleigh Barty | Australia | 24/04/1996 | 29 | 25-29 | 166 | 1.66 | R | 24056 |
| P008 | Simona Halep | Romania | 27/09/1991 | 34 | 30-34 | 168 | 1.68 | R | 24056 |
| P009 | Dominic Thiem | Austria | 03/09/1993 | 32 | 30-34 | 185 | 1.85 | R | 24056 |
| P010 | Alexander Zverev | Germany | 20/04/1997 | 28 | 25-29 | 198 | 1.98 | R | 24056 |
| P011 | Iga Swiatek | Poland | 31/05/2001 | 24 | 20-24 | 175 | 1.75 | R | 24056 |
| P012 | Carlos Alcaraz | Spain | 05/05/2003 | 22 | 20-24 | 185 | 1.85 | R | 24056 |
3) PROC SQL summaries: counts, averages, country-level rollups
proc sql;
create table work.country_summary as
select Country,
count(*) as NumPlayers,
mean(Age) as AvgAge format=6.2,
min(Age) as Youngest,
max(Age) as Oldest,
mean(Height_cm) as AvgHeight_cm format=6.1
from work.tennis_players_deriv
group by Country
order by NumPlayers desc;
quit;
proc print data=work.country_summary noobs;
run;
OUTPUT:
| Country | NumPlayers | AvgAge | Youngest | Oldest | AvgHeight_cm |
|---|---|---|---|---|---|
| Spain | 2 | 30.50 | 22 | 39 | 185.0 |
| Romania | 1 | 34.00 | 34 | 34 | 168.0 |
| Germany | 1 | 28.00 | 28 | 28 | 198.0 |
| Serbia | 1 | 38.00 | 38 | 38 | 188.0 |
| USA | 1 | 44.00 | 44 | 44 | 175.0 |
| Poland | 1 | 24.00 | 24 | 24 | 175.0 |
| United Kingdom | 1 | 38.00 | 38 | 38 | 190.0 |
| Australia | 1 | 29.00 | 29 | 29 | 166.0 |
| Austria | 1 | 32.00 | 32 | 32 | 185.0 |
| Japan | 1 | 28.00 | 28 | 28 | 180.0 |
| Switzerland | 1 | 44.00 | 44 | 44 | 185.0 |
4) Create synthetic monthly ranking points time series for 3 players
/* Choose 3 players to demonstrate time-series: P002, P001, P012 */
data work.rankings;
length PlayerID $4;
format Month yymmn6.;
/* Loop through player IDs */
do PlayerID = 'P002','P001','P012';
/* Assign a numeric ID for each player */
select (PlayerID);
when ('P002') player_num = 1;
when ('P001') player_num = 2;
when ('P012') player_num = 3;
otherwise player_num = 0;
end;
/* Generate 12 months of data per player */
do m = 0 to 11;
Month = intnx('month', '01JAN2025'd, m, 'begin');
/* Deterministic seed for reproducible randomness */
seed = 12345 + player_num*1000 + m;
call streaminit(seed);
/* Different base points per player */
base = 8000 + player_num*250;
/* Monthly seasonal and random variation */
seasonal = 200 * sin(2*3.14159*m/12) + 50*mod(m,3);
noise = round(rand('NORMAL', 0, 150), 1);
/* Final player points */
Points = round(base + seasonal + noise, 1);
output;
end;
end;
/* Drop intermediate variables */
drop m seed base seasonal noise player_num;
run;
proc print data=work.rankings(obs=12) noobs;
run;
OUTPUT:
| PlayerID | Month | Points |
|---|---|---|
| P002 | 202501 | 8281 |
| P002 | 202502 | 8363 |
| P002 | 202503 | 8752 |
| P002 | 202504 | 8285 |
| P002 | 202505 | 8585 |
| P002 | 202506 | 8330 |
| P002 | 202507 | 8127 |
| P002 | 202508 | 8151 |
| P002 | 202509 | 8147 |
| P002 | 202510 | 7846 |
| P002 | 202511 | 7933 |
| P002 | 202512 | 8413 |
5) PROC EXPAND: compute 3-month and 6-month moving averages (SAS/ETS required)
proc sort data=work.rankings;
by PlayerID Month;
run;
proc print data=work.rankings;
run;
OUTPUT:
| Obs | PlayerID | Month | Points |
|---|---|---|---|
| 1 | P001 | 202501 | 8586 |
| 2 | P001 | 202502 | 8507 |
| 3 | P001 | 202503 | 8515 |
| 4 | P001 | 202504 | 8618 |
| 5 | P001 | 202505 | 8561 |
| 6 | P001 | 202506 | 8671 |
| 7 | P001 | 202507 | 8601 |
| 8 | P001 | 202508 | 8215 |
| 9 | P001 | 202509 | 8357 |
| 10 | P001 | 202510 | 8174 |
| 11 | P001 | 202511 | 8417 |
| 12 | P001 | 202512 | 8366 |
| 13 | P002 | 202501 | 8281 |
| 14 | P002 | 202502 | 8363 |
| 15 | P002 | 202503 | 8752 |
| 16 | P002 | 202504 | 8285 |
| 17 | P002 | 202505 | 8585 |
| 18 | P002 | 202506 | 8330 |
| 19 | P002 | 202507 | 8127 |
| 20 | P002 | 202508 | 8151 |
| 21 | P002 | 202509 | 8147 |
| 22 | P002 | 202510 | 7846 |
| 23 | P002 | 202511 | 7933 |
| 24 | P002 | 202512 | 8413 |
| 25 | P012 | 202501 | 8512 |
| 26 | P012 | 202502 | 8507 |
| 27 | P012 | 202503 | 9154 |
| 28 | P012 | 202504 | 8888 |
| 29 | P012 | 202505 | 8746 |
| 30 | P012 | 202506 | 9161 |
| 31 | P012 | 202507 | 8466 |
| 32 | P012 | 202508 | 8588 |
| 33 | P012 | 202509 | 8917 |
| 34 | P012 | 202510 | 8585 |
| 35 | P012 | 202511 | 8682 |
| 36 | P012 | 202512 | 8861 |
proc expand data=work.rankings out=work.rankings_ma method=none;
by PlayerID;
id Month;
convert Points = MA_3 / transformout=(movave 3);
convert Points = MA_6 / transformout=(movave 6);
run;
proc print data=work.rankings_ma(obs=18) noobs;
run;
OUTPUT:
| PlayerID | Month | MA_3 | MA_6 | Points |
|---|---|---|---|---|
| P001 | 202501 | 8586.00 | 8586.00 | 8586 |
| P001 | 202502 | 8546.50 | 8546.50 | 8507 |
| P001 | 202503 | 8536.00 | 8536.00 | 8515 |
| P001 | 202504 | 8546.67 | 8556.50 | 8618 |
| P001 | 202505 | 8564.67 | 8557.40 | 8561 |
| P001 | 202506 | 8616.67 | 8576.33 | 8671 |
| P001 | 202507 | 8611.00 | 8578.83 | 8601 |
| P001 | 202508 | 8495.67 | 8530.17 | 8215 |
| P001 | 202509 | 8391.00 | 8503.83 | 8357 |
| P001 | 202510 | 8248.67 | 8429.83 | 8174 |
| P001 | 202511 | 8316.00 | 8405.83 | 8417 |
| P001 | 202512 | 8319.00 | 8355.00 | 8366 |
| P002 | 202501 | 8281.00 | 8281.00 | 8281 |
| P002 | 202502 | 8322.00 | 8322.00 | 8363 |
| P002 | 202503 | 8465.33 | 8465.33 | 8752 |
| P002 | 202504 | 8466.67 | 8420.25 | 8285 |
| P002 | 202505 | 8540.67 | 8453.20 | 8585 |
| P002 | 202506 | 8400.00 | 8432.67 | 8330 |
6) Plot time series and moving averages with PROC SGPLOT
proc sgpanel data=work.rankings_ma;
panelby PlayerID / layout=panel onepanel;
series x=Month y=Points / markers;
series x=Month y=MA_3 / lineattrs=(pattern=2) legendlabel='3-Month MA';
series x=Month y=MA_6 / lineattrs=(pattern=3) legendlabel='6-Month MA';
rowaxis label="Ranking Points";
colaxis label="Month";
title "Ranking Points and Moving Averages by Player";
run;
OUTPUT:
7) Macro examples:
- %derive_core(ds=, out=) : run same derivations on any dataset with DOB, Height_cm
- %player_movingavg(pid=, out=): generate moving average for a single player using PROC EXPAND
%macro derive_core(ds=work.tennis_players, out=);
data &out.;
set &ds.;
ref_date='11NOV2025'd;
years = intck('year', DOB, ref_date);
if month(ref_date) < month(DOB) or (month(ref_date)=month(DOB) and day(ref_date)<day(DOB))
then Age = years - 1;
else Age = years;
Height_m = round(Height_cm/100,0.01);
if Age < 20 then AgeGroup='Under20';
else if 20 <= Age < 25 then AgeGroup='20-24';
else if 25 <= Age < 30 then AgeGroup='25-29';
else if 30 <= Age < 35 then AgeGroup='30-34';
else AgeGroup='35plus';
drop years;
run;
proc print;run;
%mend derive_core;
%derive_core(ds=work.tennis_players, out=work.tennis_players_macro);
OUTPUT:
| Obs | PlayerID | Name | Country | Hand | DOB | Height_cm | ref_date | Age | Height_m | AgeGroup |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | P001 | Rafael Nadal | Spain | R | 03/06/1986 | 185 | 24056 | 39 | 1.85 | 35plus |
| 2 | P002 | Novak Djokovic | Serbia | R | 22/05/1987 | 188 | 24056 | 38 | 1.88 | 35plus |
| 3 | P003 | Roger Federer | Switzerland | R | 08/08/1981 | 185 | 24056 | 44 | 1.85 | 35plus |
| 4 | P004 | Andy Murray | United Kingdom | R | 15/05/1987 | 190 | 24056 | 38 | 1.90 | 35plus |
| 5 | P005 | Serena Williams | USA | R | 26/09/1981 | 175 | 24056 | 44 | 1.75 | 35plus |
| 6 | P006 | Naomi Osaka | Japan | R | 16/10/1997 | 180 | 24056 | 28 | 1.80 | 25-29 |
| 7 | P007 | Ashleigh Barty | Australia | R | 24/04/1996 | 166 | 24056 | 29 | 1.66 | 25-29 |
| 8 | P008 | Simona Halep | Romania | R | 27/09/1991 | 168 | 24056 | 34 | 1.68 | 30-34 |
| 9 | P009 | Dominic Thiem | Austria | R | 03/09/1993 | 185 | 24056 | 32 | 1.85 | 30-34 |
| 10 | P010 | Alexander Zverev | Germany | R | 20/04/1997 | 198 | 24056 | 28 | 1.98 | 25-29 |
| 11 | P011 | Iga Swiatek | Poland | R | 31/05/2001 | 175 | 24056 | 24 | 1.75 | 20-24 |
| 12 | P012 | Carlos Alcaraz | Spain | R | 05/05/2003 | 185 | 24056 | 22 | 1.85 | 20-24 |
/* Macro to compute moving averages for a single player ID (convenience wrapper) */
%macro player_movingavg(pid=P002, out=work.pid_ma);
data _tmp;
set work.rankings;
where PlayerID="&pid.";
run;
proc print;run;
proc sort data=_tmp;
by Month;
run;
proc expand data=_tmp out=&out. method=none;
id Month;
convert Points = MA_3 / transformout=(movave 3);
convert Points = MA_6 / transformout=(movave 6);
run;
%mend player_movingavg;
%player_movingavg(pid=P001, out=work.P001_ma);
OUTPUT:
| Obs | PlayerID | Month | Points |
|---|---|---|---|
| 1 | P001 | 202501 | 8586 |
| 2 | P001 | 202502 | 8507 |
| 3 | P001 | 202503 | 8515 |
| 4 | P001 | 202504 | 8618 |
| 5 | P001 | 202505 | 8561 |
| 6 | P001 | 202506 | 8671 |
| 7 | P001 | 202507 | 8601 |
| 8 | P001 | 202508 | 8215 |
| 9 | P001 | 202509 | 8357 |
| 10 | P001 | 202510 | 8174 |
| 11 | P001 | 202511 | 8417 |
| 12 | P001 | 202512 | 8366 |
proc print data=work.P001_ma noobs;
run;
OUTPUT:
| Month | MA_3 | MA_6 | PlayerID | Points |
|---|---|---|---|---|
| 202501 | 8586.00 | 8586.00 | P001 | 8586 |
| 202502 | 8546.50 | 8546.50 | P001 | 8507 |
| 202503 | 8536.00 | 8536.00 | P001 | 8515 |
| 202504 | 8546.67 | 8556.50 | P001 | 8618 |
| 202505 | 8564.67 | 8557.40 | P001 | 8561 |
| 202506 | 8616.67 | 8576.33 | P001 | 8671 |
| 202507 | 8611.00 | 8578.83 | P001 | 8601 |
| 202508 | 8495.67 | 8530.17 | P001 | 8215 |
| 202509 | 8391.00 | 8503.83 | P001 | 8357 |
| 202510 | 8248.67 | 8429.83 | P001 | 8174 |
| 202511 | 8316.00 | 8405.83 | P001 | 8417 |
| 202512 | 8319.00 | 8355.00 | P001 | 8366 |
Comments
Post a Comment