1 Data Processing - Data_Processing.R

1.1 Preprocessing

Library Loading

library(cricketdata)
library(dplyr)
library(pROC)
library(caret)
library(glmnet)
library(ggplot2)
library(stats)
library(knitr)
library(lsr)
library(datasets)
# Load the three raw inputs. All paths are relative to the working directory.
# Batting innings data.
batting_data <- read.csv(file = "../data/battinginningdata.csv")
# Bowling innings data.
bowling_data <- read.csv(file = "../data/bowlinginningdata.csv")
# Match result data from the Cricinfo results file.
match_data_file_cricinfo <- read.csv(file = "../data/MatchData-Cricinfo.csv")

Create Country Acronym Mapping - 12 Test-playing nations

# Lookup table mapping each full country name to its short team code.
# Add further entries here to support more teams.
country_acronyms <- c(
  Australia = "AUS",
  India = "IND",
  `South Africa` = "RSA",
  England = "ENG",
  `New Zealand` = "NZL",
  `West Indies` = "WI",
  Pakistan = "PAK",
  `Sri Lanka` = "SL",
  Afghanistan = "AFG",
  Bangladesh = "BAN",
  Ireland = "IRE",
  Zimbabwe = "ZIM"
)
# Full country names, taken from the lookup table so both stay in the same
# order and cannot drift apart.
countries <- names(country_acronyms)

Recode "BAN" to "Bangladesh", then remove rows involving countries that are not in the country list

# Recode the "BAN" value in Country to the full name used in `countries`,
# so those rows are not dropped by the filters below.
batting_data <- mutate(
  batting_data,
  Country = ifelse(Country == "BAN", "Bangladesh", Country)
)
bowling_data <- mutate(
  bowling_data,
  Country = ifelse(Country == "BAN", "Bangladesh", Country)
)

# Keep only matches where both teams AND the recorded winner are in
# `countries`; rows whose Winner is anything else are removed.
match_data_file_cricinfo <- match_data_file_cricinfo %>%
  filter(
    Team.1 %in% countries,
    Team.2 %in% countries,
    Winner %in% countries
  )

# Keep only innings where both the player's country and the opposition are in
# `countries`.
batting_data <- batting_data %>%
  filter(Country %in% countries, Opposition %in% countries)
bowling_data <- bowling_data %>%
  filter(Country %in% countries, Opposition %in% countries)

Replace with acronym

# Translate a vector of full country names into team codes by indexing the
# `country_acronyms` lookup table with them.
to_team_acronym <- function(country_names) {
  country_acronyms[country_names]
}

# Match results: both teams and the winner.
match_data_file_cricinfo$Team.1 <- to_team_acronym(match_data_file_cricinfo$Team.1)
match_data_file_cricinfo$Team.2 <- to_team_acronym(match_data_file_cricinfo$Team.2)
match_data_file_cricinfo$Winner <- to_team_acronym(match_data_file_cricinfo$Winner)

# Innings data: the player's country and the opposition.
batting_data$Country <- to_team_acronym(batting_data$Country)
batting_data$Opposition <- to_team_acronym(batting_data$Opposition)
bowling_data$Country <- to_team_acronym(bowling_data$Country)
bowling_data$Opposition <- to_team_acronym(bowling_data$Opposition)


# Parse the match date strings (format "%b %d, %Y", e.g. "Jan 05, 2020") into
# Date objects. Call the as.Date() generic rather than the as.Date.character
# method directly, so dispatch picks the right method for the column's class.
# NOTE(review): "%b" month names are matched using the current locale - confirm
# the script is run under an English locale.
match_data_file_cricinfo$Date <- as.Date(
  match_data_file_cricinfo$Match.Date,
  format = "%b %d, %Y"
)
# The innings files are parsed with as.Date()'s default formats.
batting_data$Date <- as.Date(batting_data$Date)
bowling_data$Date <- as.Date(bowling_data$Date)

# Copy Country/Opposition into Team/Opponent. Team is overwritten further down
# with the sorted team-pair key; the assignment is kept here so the column
# order of the data frames does not change.
batting_data$Team <- batting_data$Country
bowling_data$Team <- bowling_data$Country
batting_data$Opponent <- batting_data$Opposition
bowling_data$Opponent <- bowling_data$Opposition

Create a combined Team column in match_data_file_cricinfo

# Build an order-independent key for the fixture: the two team codes joined by
# a space, with the alphabetically smaller code first.
match_data_file_cricinfo <- mutate(
  match_data_file_cricinfo,
  Team = ifelse(
    Team.1 < Team.2,
    paste(Team.1, Team.2),
    paste(Team.2, Team.1)
  )
)

Create a combined Team column in batting_data and bowling_data

# Build the same order-independent fixture key on the innings data: the
# country and opposition codes joined by a space, alphabetically smaller code
# first. This replaces the Team column set earlier.
batting_data <- mutate(
  batting_data,
  Team = ifelse(
    Country < Opposition,
    paste(Country, Opposition),
    paste(Opposition, Country)
  )
)
bowling_data <- mutate(
  bowling_data,
  Team = ifelse(
    Country < Opposition,
    paste(Country, Opposition),
    paste(Opposition, Country)
  )
)

Print some info for debugging

# Debug output: the distinct values of the two join keys (Date and Team) in
# each data set, so mismatches between the sources can be spotted by eye.
print("Unique Dates in match_data_file_cricinfo:")
print(unique(match_data_file_cricinfo[["Date"]]))
print("Unique Dates in batting_data:")
print(unique(batting_data[["Date"]]))
print("Unique Dates in bowling_data:")
print(unique(bowling_data[["Date"]]))

print("Unique Teams in match_data_file_cricinfo:")
print(unique(match_data_file_cricinfo[["Team"]]))
print("Unique Teams in batting_data:")
print(unique(batting_data[["Team"]]))
print("Unique Teams in bowling_data:")
print(unique(bowling_data[["Team"]]))

Join data by match

# Attach the match ID to every innings row by matching on Date and the
# team-pair key. Innings with no matching match row keep an NA ID.
batting_data <- batting_data %>%
  left_join(
    dplyr::select(match_data_file_cricinfo, ID, Date, Team),
    by = c("Date", "Team")
  )
bowling_data <- bowling_data %>%
  left_join(
    dplyr::select(match_data_file_cricinfo, ID, Date, Team),
    by = c("Date", "Team")
  )

Compute Batting Impact

# Per-innings batting impact score.
# OutPenalty is 0.5 when the batter was dismissed (NotOut == FALSE), else 0.
batting_data <- batting_data %>%
  mutate(OutPenalty = ifelse(NotOut == FALSE, 0.5, 0))

# Weighted sum of runs, strike rate relative to 130, fours and sixes, minus
# the dismissal penalty.
batting_data <- batting_data %>%
  mutate(
    BattingImpact = Runs * 0.125 +
      (StrikeRate - 130) * 0.025 +
      Fours * 0.3 +
      Sixes * 0.5 -
      OutPenalty
  )

Compute Bowling Impact

# Per-innings bowling impact score: weighted wickets and maidens, minus half
# of the amount by which Economy exceeds 7.4.
bowling_data <- mutate(
  bowling_data,
  BowlingImpact = Wickets * 3.25 +
    Maidens * 3 -
    (Economy - 7.4) * 0.5
)

1.2 Distribution Comparison - Verify Batting and Bowling Impact are Similar

Aggregate Batting and Bowling Impact by Match (ID) and Country, Plot Distribution

# Total batting impact per match (ID) and country. NA innings scores are
# ignored in the sum. `.groups = "drop"` returns an ungrouped result directly,
# which avoids summarise()'s regrouping message and the need for a trailing
# ungroup().
batting_impact <- batting_data %>%
  group_by(ID, Country) %>%
  summarise(
    TotalBattingImpact = sum(BattingImpact, na.rm = TRUE),
    .groups = "drop"
  )

# Total bowling impact per match (ID) and country, computed the same way.
bowling_impact <- bowling_data %>%
  group_by(ID, Country) %>%
  summarise(
    TotalBowlingImpact = sum(BowlingImpact, na.rm = TRUE),
    .groups = "drop"
  )

# Overlay the two distributions: bowling totals in translucent blue, then
# batting totals in translucent red added onto the same plot (add = TRUE).
hist(
  bowling_impact$TotalBowlingImpact,
  breaks = 50,
  main = "Bowling and Batting Impact Distribution",
  xlab = "Total Impact",
  col = rgb(0, 0, 1, 1/4)
)
hist(
  batting_impact$TotalBattingImpact,
  breaks = 50,
  main = "Batting Impact Distribution",
  xlab = "Total Batting Impact",
  col = rgb(1, 0, 0, 1/4),
  add = TRUE
)

The Kolmogorov-Smirnov test gives p = 0.37 (> 0.05), so we cannot reject the null hypothesis that the two distributions are the same; there is essentially no evidence of a difference between them

# Two-sample Kolmogorov-Smirnov test comparing the bowling and batting totals.
ks.test(
  x = bowling_impact$TotalBowlingImpact,
  y = batting_impact$TotalBattingImpact
)
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  bowling_impact$TotalBowlingImpact and batting_impact$TotalBattingImpact
## D = 0.028116, p-value = 0.3678
## alternative hypothesis: two-sided

QQ Plot to visually compare the distributions

# Compare the two distributions on 1000 evenly spaced quantiles between the
# 2.5% and 97.5% levels, so the extreme tails are left out of the plot.
qq_probs <- seq(0.025, 0.975, length.out = 1000)
bowling_quantiles <- quantile(bowling_impact$TotalBowlingImpact, probs = qq_probs)
batting_quantiles <- quantile(batting_impact$TotalBattingImpact, probs = qq_probs)

qqplot(
  bowling_quantiles,
  batting_quantiles,
  main = "QQ Plot: Bowling vs Batting Impact",
  xlab = "Bowling Impact Quantiles",
  ylab = "Batting Impact Quantiles",
  pch = 19,
  col = "purple"
)
# Line of equality (intercept 0, slope 1): points on it mean equal quantiles.
abline(a = 0, b = 1, col = "red", lwd = 2)

Calculate Cohen’s d to confirm that the effect size of the difference is negligible (d < 0.1)

# Effect size (Cohen's d) of the difference between batting and bowling totals.
cohensD(
  x = batting_impact$TotalBattingImpact,
  y = bowling_impact$TotalBowlingImpact
)
## [1] 0.02783777

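Rescaling placeholder: bowling impact was meant to be rescaled to the batting mean and sd, but the assignments below leave both impact columns unchanged (no rescaling is applied)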

# Placeholder for a rescaling step: each column is assigned to itself, so both
# impact totals are left exactly as computed above.
bowling_impact$TotalBowlingImpact <- bowling_impact$TotalBowlingImpact
batting_impact$TotalBattingImpact <- batting_impact$TotalBattingImpact

Merge Batting & Bowling Impact

# Combine the per-match batting and bowling totals for each country. The full
# join keeps (ID, Country) pairs present in only one table; for those rows the
# missing side is NA, and so TotalImpact is NA as well.
team_impact <- full_join(
  batting_impact,
  bowling_impact,
  by = c("ID", "Country")
)
team_impact <- mutate(
  team_impact,
  TotalImpact = TotalBattingImpact + TotalBowlingImpact
)

1.3 Merge with Match Data and Measure Correlation with Match Outcome

Do not run the commented-out line below on the first run; on later re-runs it can be uncommented to remove columns 10–16 (resetting the joined columns) before joining again

# Optional reset for re-runs only (drops columns 10 to 16):
#match_data_file_cricinfo[, 10:16] <- list(NULL)

# Attach Team.1's impact for the match and label it Impact_Team1.
with_team1_impact <- left_join(
  match_data_file_cricinfo,
  team_impact,
  by = c("ID" = "ID", "Team.1" = "Country")
)
with_team1_impact <- rename(with_team1_impact, Impact_Team1 = TotalImpact)

# Attach Team.2's impact the same way and label it Impact_Team2. The batting
# and bowling total columns arrive a second time here, so dplyr suffixes the
# duplicated names.
match_data_file_cricinfo <- rename(
  left_join(
    with_team1_impact,
    team_impact,
    by = c("ID" = "ID", "Team.2" = "Country")
  ),
  Impact_Team2 = TotalImpact
)

Compute Impact Difference & Outcome Variable

# Impact_Diff: Team.1's total impact minus Team.2's.
match_data_file_cricinfo <- mutate(
  match_data_file_cricinfo,
  Impact_Diff = Impact_Team1 - Impact_Team2
)
# Winner_Team1: 1 when Team.1 is the recorded winner, otherwise 0.
match_data_file_cricinfo <- mutate(
  match_data_file_cricinfo,
  Winner_Team1 = ifelse(Winner == Team.1, 1, 0)
)

Print the final correlation and R-squared value

# Pearson correlation between the impact gap and the Team.1-win indicator,
# using only rows where both values are present. Computed once and reused for
# the printed value and for R-squared.
impact_outcome_cor <- cor(
  match_data_file_cricinfo$Impact_Diff,
  match_data_file_cricinfo$Winner_Team1,
  use = "complete.obs"
)
print(paste("Final correlation: ", impact_outcome_cor))
## [1] "Final correlation:  0.774106530488118"
rsquared <- impact_outcome_cor^2
# cor.test() has no `use` argument (it was silently absorbed by `...`); the
# test already restricts itself to complete pairs, so the argument is dropped.
cor.test(match_data_file_cricinfo$Impact_Diff, match_data_file_cricinfo$Winner_Team1)
## 
##  Pearson's product-moment correlation
## 
## data:  match_data_file_cricinfo$Impact_Diff and match_data_file_cricinfo$Winner_Team1
## t = 39.793, df = 1059, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7488087 0.7971521
## sample estimates:
##       cor 
## 0.7741065
print(paste("Final R-squared: ", rsquared))
## [1] "Final R-squared:  0.599240920544352"