4 Model Training and Evaluation - Cricket_Prediction.R
4.1 Creating Training Dataset
Create data frame for actual impact totals, predicted winner, and actual winner
# Pre-allocate one row per match: 14 impact features per team
# (8 batsmen + 6 bowlers), plus the match ID in column 29.
playertotalimpact <- data.frame(Batsman1_T1 = numeric(nummatches),
                                Batsman2_T1 = numeric(nummatches),
                                Batsman3_T1 = numeric(nummatches),
                                Batsman4_T1 = numeric(nummatches),
                                Batsman5_T1 = numeric(nummatches),
                                Batsman6_T1 = numeric(nummatches),
                                Batsman7_T1 = numeric(nummatches),
                                Batsman8_T1 = numeric(nummatches),
                                Bowler1_T1 = numeric(nummatches),
                                Bowler2_T1 = numeric(nummatches),
                                Bowler3_T1 = numeric(nummatches),
                                Bowler4_T1 = numeric(nummatches),
                                Bowler5_T1 = numeric(nummatches),
                                Bowler6_T1 = numeric(nummatches),
                                Batsman1_T2 = numeric(nummatches),
                                Batsman2_T2 = numeric(nummatches),
                                Batsman3_T2 = numeric(nummatches),
                                Batsman4_T2 = numeric(nummatches),
                                Batsman5_T2 = numeric(nummatches),
                                Batsman6_T2 = numeric(nummatches),
                                Batsman7_T2 = numeric(nummatches),
                                Batsman8_T2 = numeric(nummatches),
                                Bowler1_T2 = numeric(nummatches),
                                Bowler2_T2 = numeric(nummatches),
                                Bowler3_T2 = numeric(nummatches),
                                Bowler4_T2 = numeric(nummatches),
                                Bowler5_T2 = numeric(nummatches),
                                Bowler6_T2 = numeric(nummatches),
                                ID = numeric(nummatches))
# Create match counter (row index into playertotalimpact; the loop below
# writes playertotalimpact[counter, ] and increments it once per match).
counter <- 1
Loop through each match in match_data_file_cricinfo and calculate total impact for each player in the playing xi
# Loop through each match in match_data_file_cricinfo and fill one row of
# playertotalimpact with ground-adjusted per-player impact totals.
for(id in match_data_file_cricinfo$ID){
  # Get the playing XI from player data.
  # NOTE(review): `filter(id == ID)` relies on dplyr data-masking resolving
  # `id` to the loop variable and `ID` to the data column — this only holds
  # if player_data has no column named `id`; confirm, or use
  # filter(ID == .env$id) to be explicit.
  team1_playingxi = player_data %>% filter(id == ID)
  # First 22 rows are assumed to be the two playing XIs for this match.
  team1_playingxi = team1_playingxi[1:22,]
  # Column 1 of the match row is Team 1's country.
  team1_playingxi = team1_playingxi %>% filter(Country == (match_data_file_cricinfo %>% filter(id == ID))[1,1])
  team1_playingxi = team1_playingxi$Player
  team2_playingxi = player_data %>% filter(id == ID)
  team2_playingxi = team2_playingxi[1:22,]
  # Column 2 of the match row is Team 2's country.
  team2_playingxi = team2_playingxi %>% filter(Country == (match_data_file_cricinfo %>% filter(id == ID))[1,2])
  team2_playingxi = team2_playingxi$Player
  # Get the ground buffs (column 5 of the match row is the ground name).
  ground_buffs = venue_factors %>% filter(Ground == (match_data_file_cricinfo %>% filter(id == ID))[1,5])
  # Get the player ratings per match for the playing XI.
  team1_playingxi_ratings = player_rankings_2 %>% filter(Player %in% team1_playingxi)
  team2_playingxi_ratings = player_rankings_2 %>% filter(Player %in% team2_playingxi)
  # Apply the ground buffs, weighted by player rating: stronger players get
  # more of the venue effect. The *_2 columns hold the adjusted impacts.
  team1_playingxi_ratings$BatImpactperGame_2 = 0
  team2_playingxi_ratings$BatImpactperGame_2 = 0
  team1_playingxi_ratings$BowlImpactperGame_2 = 0
  team2_playingxi_ratings$BowlImpactperGame_2 = 0
  for(i in 1:nrow(team1_playingxi_ratings)){
    # Sort by batting rating so index i is the i-th best batsman.
    team1_playingxi_ratings = team1_playingxi_ratings %>% arrange(desc(BatImpactperGame))
    # Scale by the venue's batting factor relative to the median venue,
    # damped by (9-i)/(10-i); assume only 8 batsmen will play.
    if(i < 9){
      team1_avg = team1_playingxi_ratings$BatImpactperGame[i] * (1 / median(venue_factors$BattingScale2)) * ground_buffs$BattingScale2[1] * ((9-i)/(10-i))
      team1_playingxi_ratings$BatImpactperGame_2[i] = team1_avg
    }
    else {
      # Bottom batsmen don't get a buff and don't play as much.
      team1_avg = 0
    }
    # Re-sort by bowling rating so index i is the i-th best bowler.
    team1_playingxi_ratings = team1_playingxi_ratings %>% arrange(desc(BowlImpactperGame))
    if(i < 7) {
      team1_avg = team1_playingxi_ratings$BowlImpactperGame[i] * (1 / median(venue_factors$BowlingScale2)) * ground_buffs$BowlingScale2[1] * ((8-i)/(9-i))
      team1_playingxi_ratings$BowlImpactperGame_2[i] = team1_avg
    }
    else {
      # Bottom bowlers don't bowl in the game.
      team1_playingxi_ratings$BowlImpactperGame_2[i] = 0
    }
  }
  # Same adjustment for Team 2.
  for(i in 1:nrow(team2_playingxi_ratings)){
    team2_playingxi_ratings = team2_playingxi_ratings %>% arrange(desc(BatImpactperGame))
    # Divide by appropriate factor; assume only 8 batsmen will play.
    if(i < 9){
      team2_avg = team2_playingxi_ratings$BatImpactperGame[i] * (1 / median(venue_factors$BattingScale2)) * ground_buffs$BattingScale2[1] * ((9-i)/(10-i))
      team2_playingxi_ratings$BatImpactperGame_2[i] = team2_avg
    }
    else{
      # Bottom batsmen don't get a buff and don't play as much.
      team2_avg = 0
    }
    team2_playingxi_ratings = team2_playingxi_ratings %>% arrange(desc(BowlImpactperGame))
    if(i < 7){
      team2_avg = team2_playingxi_ratings$BowlImpactperGame[i] * (1 / median(venue_factors$BowlingScale2)) * ground_buffs$BowlingScale2[1] * ((8-i)/(9-i))
      team2_playingxi_ratings$BowlImpactperGame_2[i] = team2_avg
    }
    else{
      # Bottom bowlers don't bowl in the game.
      team2_playingxi_ratings$BowlImpactperGame_2[i] = 0
    }
  }
  # Write Team 1's features: columns 1-8 are the top 8 adjusted batting
  # impacts (descending), columns 9-14 the top 6 adjusted bowling impacts.
  for(j in 1:(ncol(playertotalimpact) / 2)){
    if(j < 9){
      team1_playingxi_ratings = team1_playingxi_ratings %>% arrange(desc(BatImpactperGame))
      playertotalimpact[counter,j] = team1_playingxi_ratings$BatImpactperGame_2[j]
    }
    else{
      team1_playingxi_ratings = team1_playingxi_ratings %>% arrange(desc(BowlImpactperGame))
      playertotalimpact[counter,j] = team1_playingxi_ratings$BowlImpactperGame_2[j-8]
    }
  }
  # Team 2's features occupy columns 15-28 (offset by 14).
  for(j in 1:(ncol(playertotalimpact) / 2)){
    if(j < 9){
      team2_playingxi_ratings = team2_playingxi_ratings %>% arrange(desc(BatImpactperGame))
      playertotalimpact[counter,(j + 14)] = team2_playingxi_ratings$BatImpactperGame_2[j]
    }
    else{
      team2_playingxi_ratings = team2_playingxi_ratings %>% arrange(desc(BowlImpactperGame))
      playertotalimpact[counter,(j + 14)] = team2_playingxi_ratings$BowlImpactperGame_2[j-8]
    }
  }
  # Record the match ID (column 29) and advance to the next row.
  playertotalimpact[counter, 29] = id
  counter = counter + 1
}
# Join by id to match_data_cricinfo$Winner_Team1
Remove all columns from match_data_cricinfo except Winner_Team1
# NOTE(review): playertotalimpact2 is presumably the result of joining
# playertotalimpact to match_data_cricinfo on ID — the join step is not
# visible in this chunk; confirm it keeps Winner_Team1.
# Drop the joined match-metadata columns (30-44), then the ID column (29),
# leaving the 28 impact features plus Winner_Team1.
playertotalimpact2 = playertotalimpact2 %>% dplyr::select(-c(30:44))
playertotalimpact2 = playertotalimpact2 %>% dplyr::select(-c(29))
# Find NAs
Any row containing an NA in any cell is removed.
4.2 Heuristic: If Team 1 has greater total impact, predict Team 1 wins, else Team 2 wins
# Heuristic baseline: whichever team has the larger total adjusted impact is
# predicted to win. Columns 1-14 are Team 1's features, 15-28 Team 2's
# (the original comment "rows one through 17" was wrong — these are columns).
nummatches = nrow(playertotalimpact2)
# Vectorized row totals replace the per-row loop; unname() keeps the plain
# numeric vectors the original loop produced.
team1vec = unname(rowSums(playertotalimpact2[, 1:14]))
team2vec = unname(rowSums(playertotalimpact2[, 15:28]))
# Predicted winner: 1 if Team 1's total is strictly greater, else 0.
predicted_winner = as.numeric(team1vec > team2vec)
# Count agreements with the actual outcome (Winner_Team1 coded 0/1).
predcorrectcount = sum(predicted_winner == as.numeric(playertotalimpact2$Winner_Team1))
accuracy = predcorrectcount / nummatches
print(paste("Heuristic Accuracy on all matches:", round(accuracy * 100, 2), "%"))
## [1] "Heuristic Accuracy on all matches: 65.98 %"
4.3 Logistic Regression with repeated splits
# Recode the 0/1 outcome as a labeled two-class factor (caret needs valid
# R names for class levels).
target <- factor(playertotalimpact2$Winner_Team1, levels = c(0, 1), labels = c("Team2", "Team1"))
# Per-iteration metric stores for the 30 repeated train/test splits.
accvector_lr <- numeric(30)     # accuracy per split
rmsevector_lr <- numeric(30)    # RMSE per split
loglossvector_lr <- numeric(30) # log loss per split
compvector_lr <- numeric(30)    # composite score per split
set.seed(314)
overallbestlambda <- 0
overallbestalpha <- 0
# Lowest composite score seen so far (lower is better).
best_composite_lr <- Inf
# Repeat 30 random 80/20 splits; on each, grid-search an elastic-net logistic
# model with 5-fold CV, refit at the best (alpha, lambda), and score the
# held-out 20% on accuracy, RMSE, log loss, and a composite of the three.
for(i in 1:30){
# Train-test split (80-20)
train_index <- createDataPartition(playertotalimpact2$Winner_Team1, p = 0.8, list = FALSE)
train_data <- playertotalimpact2[train_index, ]
test_data <- playertotalimpact2[-train_index, ]
# Define Predictors and Target (glmnet requires a numeric matrix for x)
train_x <- as.matrix(train_data[, 1:28])
train_y <- as.factor(train_data$Winner_Team1) # 0/1
test_x <- as.matrix(test_data[, 1:28])
test_y <- as.factor(test_data$Winner_Team1) # 0/1
# Hyperparameter grid search over elastic-net mixing (alpha) and penalty (lambda)
grid <- expand.grid(alpha = seq(0, 1, by = 0.1), lambda = seq(0.001, 0.1, by = 0.001))
logistic_model <- train(
x = train_x,
y = train_y,
method = "glmnet",
trControl = trainControl(method = "cv", number = 5),
family = "binomial",
metric = "Accuracy",
tuneGrid = grid
)
best_lambda <- logistic_model$bestTune$lambda
best_alpha <- logistic_model$bestTune$alpha
print(paste("Best Lambda:", round(best_lambda, 4), "Best Alpha:", round(best_alpha, 4)))
# Train logistic model with best parameters (refit directly with glmnet;
# factor is converted to 0/1 numeric)
best_log_model <- glmnet(
x = train_x,
y = as.numeric(train_y) - 1,
family = "binomial",
alpha = best_alpha,
lambda = best_lambda
)
# Get test set probabilities (P(class = 1))
test_prob <- predict(best_log_model, newx = test_x, type = "response", s = best_lambda)
# Apply 0.5 threshold to test predictions and relabel to Team2/Team1
test_predictions <- factor(ifelse(test_prob > 0.5, 1, 0),
levels = c(0, 1),
labels = c("Team2", "Team1"))
# Compute metrics. as.numeric(test_y) - 1 maps factor levels "0"/"1" to 0/1.
test_rmse = sqrt(mean((as.numeric(test_y) - 1 - (as.numeric(test_prob)))^2))
# Note: test_y still has levels "0"/"1" here, so the "1" comparison is valid;
# it is only relabeled to Team2/Team1 on the next line.
test_logloss = -mean(ifelse(test_y == "1", log(test_prob), log(1 - test_prob)))
test_y <- factor(test_y, levels = c(0, 1), labels = c("Team2", "Team1"))
conf_matrix <- confusionMatrix(test_predictions, test_y)
test_acc = conf_matrix$overall['Accuracy']
# Store metrics
accvector_lr[i] <- test_acc
rmsevector_lr[i] <- test_rmse
loglossvector_lr[i] <- test_logloss
# Composite score: (1 - accuracy) + RMSE + log loss; lower is better
compvector_lr[i] <- (1 - test_acc) + test_rmse + test_logloss
print(paste("Iter", i,
"- Accuracy:", round(test_acc, 4),
"RMSE:", round(test_rmse, 4),
"LogLoss:", round(test_logloss, 4),
"Composite:", round(compvector_lr[i], 4)))
# Update best model by composite score
if(compvector_lr[i] < best_composite_lr){
best_composite_lr = compvector_lr[i]
best_acc_lr = accvector_lr[i]
best_rmse_lr = rmsevector_lr[i]
best_logloss_lr = loglossvector_lr[i]
overallbestlambda = best_lambda
overallbestalpha = best_alpha
}
}## [1] "Best Lambda: 0.044 Best Alpha: 0.2"
## [1] "Iter 1 - Accuracy: 0.6604 RMSE: 0.4734 LogLoss: 0.6394 Composite: 1.4525"
## [1] "Best Lambda: 0.1 Best Alpha: 0"
## [1] "Iter 2 - Accuracy: 0.6792 RMSE: 0.458 LogLoss: 0.6089 Composite: 1.3877"
## [1] "Best Lambda: 0.057 Best Alpha: 0"
## [1] "Iter 3 - Accuracy: 0.6509 RMSE: 0.4636 LogLoss: 0.6177 Composite: 1.4304"
## [1] "Best Lambda: 0.084 Best Alpha: 0.1"
## [1] "Iter 4 - Accuracy: 0.6415 RMSE: 0.4754 LogLoss: 0.6433 Composite: 1.4772"
## [1] "Best Lambda: 0.01 Best Alpha: 0.3"
## [1] "Iter 5 - Accuracy: 0.6887 RMSE: 0.449 LogLoss: 0.5898 Composite: 1.3501"
## [1] "Best Lambda: 0.008 Best Alpha: 1"
## [1] "Iter 6 - Accuracy: 0.6462 RMSE: 0.4655 LogLoss: 0.6224 Composite: 1.4417"
## [1] "Best Lambda: 0.055 Best Alpha: 0"
## [1] "Iter 7 - Accuracy: 0.6981 RMSE: 0.4564 LogLoss: 0.6064 Composite: 1.3648"
## [1] "Best Lambda: 0.068 Best Alpha: 0.1"
## [1] "Iter 8 - Accuracy: 0.684 RMSE: 0.4641 LogLoss: 0.6201 Composite: 1.4002"
## [1] "Best Lambda: 0.093 Best Alpha: 0.1"
## [1] "Iter 9 - Accuracy: 0.6887 RMSE: 0.4581 LogLoss: 0.6081 Composite: 1.3775"
## [1] "Best Lambda: 0.066 Best Alpha: 0.1"
## [1] "Iter 10 - Accuracy: 0.6132 RMSE: 0.472 LogLoss: 0.6357 Composite: 1.4945"
## [1] "Best Lambda: 0.1 Best Alpha: 0"
## [1] "Iter 11 - Accuracy: 0.6132 RMSE: 0.4858 LogLoss: 0.6666 Composite: 1.5391"
## [1] "Best Lambda: 0.013 Best Alpha: 0"
## [1] "Iter 12 - Accuracy: 0.684 RMSE: 0.4659 LogLoss: 0.6269 Composite: 1.4089"
## [1] "Best Lambda: 0.008 Best Alpha: 0.8"
## [1] "Iter 13 - Accuracy: 0.6132 RMSE: 0.4674 LogLoss: 0.6258 Composite: 1.48"
## [1] "Best Lambda: 0.052 Best Alpha: 0"
## [1] "Iter 14 - Accuracy: 0.6509 RMSE: 0.4685 LogLoss: 0.6278 Composite: 1.4453"
## [1] "Best Lambda: 0.089 Best Alpha: 0.3"
## [1] "Iter 15 - Accuracy: 0.6462 RMSE: 0.4796 LogLoss: 0.652 Composite: 1.4854"
## [1] "Best Lambda: 0.017 Best Alpha: 0.3"
## [1] "Iter 16 - Accuracy: 0.6792 RMSE: 0.4614 LogLoss: 0.6156 Composite: 1.3977"
## [1] "Best Lambda: 0.058 Best Alpha: 0.3"
## [1] "Iter 17 - Accuracy: 0.6651 RMSE: 0.4667 LogLoss: 0.6266 Composite: 1.4283"
## [1] "Best Lambda: 0.044 Best Alpha: 0"
## [1] "Iter 18 - Accuracy: 0.6415 RMSE: 0.4685 LogLoss: 0.6303 Composite: 1.4573"
## [1] "Best Lambda: 0.003 Best Alpha: 0.2"
## [1] "Iter 19 - Accuracy: 0.6698 RMSE: 0.4682 LogLoss: 0.6285 Composite: 1.4269"
## [1] "Best Lambda: 0.055 Best Alpha: 0.1"
## [1] "Iter 20 - Accuracy: 0.717 RMSE: 0.4474 LogLoss: 0.5876 Composite: 1.3181"
## [1] "Best Lambda: 0.1 Best Alpha: 0"
## [1] "Iter 21 - Accuracy: 0.6698 RMSE: 0.47 LogLoss: 0.6323 Composite: 1.4325"
## [1] "Best Lambda: 0.07 Best Alpha: 0"
## [1] "Iter 22 - Accuracy: 0.6557 RMSE: 0.4708 LogLoss: 0.6345 Composite: 1.4496"
## [1] "Best Lambda: 0.074 Best Alpha: 0"
## [1] "Iter 23 - Accuracy: 0.6415 RMSE: 0.4779 LogLoss: 0.6474 Composite: 1.4838"
## [1] "Best Lambda: 0.058 Best Alpha: 0"
## [1] "Iter 24 - Accuracy: 0.6934 RMSE: 0.4497 LogLoss: 0.5916 Composite: 1.3479"
## [1] "Best Lambda: 0.1 Best Alpha: 0"
## [1] "Iter 25 - Accuracy: 0.6321 RMSE: 0.4674 LogLoss: 0.6276 Composite: 1.463"
## [1] "Best Lambda: 0.086 Best Alpha: 0"
## [1] "Iter 26 - Accuracy: 0.6651 RMSE: 0.4629 LogLoss: 0.6201 Composite: 1.4179"
## [1] "Best Lambda: 0.076 Best Alpha: 0"
## [1] "Iter 27 - Accuracy: 0.684 RMSE: 0.4496 LogLoss: 0.59 Composite: 1.3557"
## [1] "Best Lambda: 0.1 Best Alpha: 0"
## [1] "Iter 28 - Accuracy: 0.6368 RMSE: 0.4738 LogLoss: 0.6398 Composite: 1.4767"
## [1] "Best Lambda: 0.076 Best Alpha: 0"
## [1] "Iter 29 - Accuracy: 0.6226 RMSE: 0.4693 LogLoss: 0.6272 Composite: 1.4739"
## [1] "Best Lambda: 0.091 Best Alpha: 0.2"
## [1] "Iter 30 - Accuracy: 0.6651 RMSE: 0.4702 LogLoss: 0.6338 Composite: 1.4389"
# Report the winning hyperparameters and their held-out metrics.
lr_summary <- paste("Best Lambda:", round(overallbestlambda, 4),
                    "Best Alpha:", round(overallbestalpha, 4),
                    "with Composite Score:", round(best_composite_lr, 4),
                    "Accuracy:", round(best_acc_lr, 4),
                    "RMSE:", round(best_rmse_lr, 4),
                    "LogLoss:", round(best_logloss_lr, 4))
print(lr_summary)
## [1] "Best Lambda: 0.055 Best Alpha: 0.1 with Composite Score: 1.3181 Accuracy: 0.717 RMSE: 0.4474 LogLoss: 0.5876"
Train the final Logistic Regression model on the entire dataset
set.seed(314) # For reproducibility
train_index <- createDataPartition(target, p = 0.8, list = FALSE)
train_data <- playertotalimpact2[train_index, ]
test_data <- playertotalimpact2[-train_index, ]
# Define Predictors and Target for the final model
predictors <- as.matrix(train_data[, 1:28]) # Player impact features
# NOTE(review): this overwrites the full-data `target` factor defined above
# with the training-subset labels — confirm nothing downstream needs the
# original full-length vector.
target <- as.factor(train_data$Winner_Team1) # Target variable (0 or 1)
# Fit the final model on the ENTIRE dataset with the alpha/lambda selected by
# the repeated-split grid search. Fixes vs. the original: glmnet requires a
# numeric matrix for x (a data.frame errors), and the trailing comma after
# the lambda argument was a runtime error ("argument is empty").
final_model = glmnet(
  x = as.matrix(playertotalimpact2[, 1:28]), # Player impact features
  y = playertotalimpact2[, 29],              # Winner_Team1 outcome (0/1)
  family = "binomial",
  alpha = overallbestalpha,   # Use the best alpha from previous model
  lambda = overallbestlambda  # Use the best lambda from previous model
)
print(paste("Accuracy value on test set: ", round(best_acc_lr * 100, 2), "%"))## [1] "Accuracy value on test set: 71.7 %"
4.4 RF model
Train-test split (80-20)
set.seed(314) # For reproducibility
# Metric stores for the 30 repeated RF train/test splits.
accvec_rf <- numeric(30)     # accuracy per split
rmsevec_rf <- numeric(30)    # RMSE per split
loglossvec_rf <- numeric(30) # log loss per split
compvec_rf <- numeric(30)    # composite score per split
# Lowest composite seen so far, plus the tuning values that produced it.
best_composite_rf <- Inf
overallbestmtry_rf <- 0
overallbestntree_rf <- 500
set.seed(314)
# Repeat 30 random 80/20 splits; on each, tune a random forest's mtry via
# 5-fold CV and score the held-out 20% on accuracy, RMSE, log loss, and the
# same composite used for the logistic model.
for(i in 1:30) { # Repeat for 30 iterations
# Train-test split (80-20)
train_index_rf <- createDataPartition(playertotalimpact2$Winner_Team1, p = 0.8, list = FALSE)
train_data_rf <- playertotalimpact2[train_index_rf, ]
test_data_rf <- playertotalimpact2[-train_index_rf, ]
# Define Predictors and Target
train_x_rf <- train_data_rf[, 1:28]
train_y_rf <- as.factor(train_data_rf$Winner_Team1)
test_x_rf <- test_data_rf[, 1:28]
test_y_rf <- as.factor(test_data_rf$Winner_Team1)
# Train RF with caret::train (5-fold CV, tuning mtry automatically over
# 5 candidate values; ntree held fixed at 500)
rf_model <- train(
x = train_x_rf,
y = train_y_rf,
method = "rf",
trControl = trainControl(method = "cv", number = 5),
tuneLength = 5,
ntree = 500,
importance = TRUE,
metric = "Accuracy"
)
# Best tuning param
best_mtry_rf <- rf_model$bestTune$mtry
# Predictions (class + probs); column 2 of the prob matrix is P(class = "1")
rf_predictions <- predict(rf_model, newdata = test_x_rf)
rf_prob <- predict(rf_model, newdata = test_x_rf, type = "prob")[,2]
# Accuracy
rf_conf_matrix <- confusionMatrix(rf_predictions, test_y_rf)
rf_accuracy <- rf_conf_matrix$overall['Accuracy']
# RMSE (probability vs actual outcome; as.numeric(factor) - 1 maps to 0/1)
rf_rmse <- sqrt(mean((as.numeric(test_y_rf) - 1 - rf_prob)^2))
# Log Loss
rf_logloss <- -mean(ifelse(test_y_rf == "1", log(rf_prob), log(1 - rf_prob)))
# Composite Score: (1 - accuracy) + RMSE + log loss; lower is better
rf_composite <- (1 - rf_accuracy) + rf_rmse + rf_logloss
# Store metrics
accvec_rf[i] <- rf_accuracy
rmsevec_rf[i] <- rf_rmse
loglossvec_rf[i] <- rf_logloss
compvec_rf[i] <- rf_composite
# Print metrics
print(paste("Iter", i,
"- Acc:", round(rf_accuracy, 4),
"RMSE:", round(rf_rmse, 4),
"LogLoss:", round(rf_logloss, 4),
"Composite:", round(rf_composite, 4)))
# Track best composite (ntree never varies, so overallbestntree_rf stays 500)
if(rf_composite < best_composite_rf){
best_composite_rf = rf_composite
best_acc_rf = accvec_rf[i]
best_rmse_rf = rmsevec_rf[i]
best_logloss_rf = loglossvec_rf[i]
overallbestmtry_rf = best_mtry_rf
}
}## [1] "Iter 1 - Acc: 0.6651 RMSE: 0.4714 LogLoss: 0.6407 Composite: 1.447"
## [1] "Iter 2 - Acc: 0.6462 RMSE: 0.4909 LogLoss: 0.6891 Composite: 1.5338"
## [1] "Iter 3 - Acc: 0.6085 RMSE: 0.4783 LogLoss: 0.6516 Composite: 1.5214"
## [1] "Iter 4 - Acc: 0.6085 RMSE: 0.4993 LogLoss: 0.7004 Composite: 1.5912"
## [1] "Iter 5 - Acc: 0.6651 RMSE: 0.476 LogLoss: 0.6493 Composite: 1.4602"
## [1] "Iter 6 - Acc: 0.6038 RMSE: 0.4832 LogLoss: 0.6624 Composite: 1.5418"
## [1] "Iter 7 - Acc: 0.6792 RMSE: 0.4642 LogLoss: 0.6227 Composite: 1.4077"
## [1] "Iter 8 - Acc: 0.6557 RMSE: 0.4694 LogLoss: 0.6295 Composite: 1.4432"
## [1] "Iter 9 - Acc: 0.6745 RMSE: 0.4757 LogLoss: 0.6466 Composite: 1.4478"
## [1] "Iter 10 - Acc: 0.6509 RMSE: 0.4697 LogLoss: 0.6294 Composite: 1.4481"
## [1] "Iter 11 - Acc: 0.6415 RMSE: 0.4819 LogLoss: 0.6668 Composite: 1.5072"
## [1] "Iter 12 - Acc: 0.6934 RMSE: 0.4572 LogLoss: 0.6074 Composite: 1.3712"
## [1] "Iter 13 - Acc: 0.6745 RMSE: 0.4583 LogLoss: 0.6082 Composite: 1.3919"
## [1] "Iter 14 - Acc: 0.684 RMSE: 0.4519 LogLoss: 0.5959 Composite: 1.3639"
## [1] "Iter 15 - Acc: 0.6557 RMSE: 0.4775 LogLoss: 0.6523 Composite: 1.4742"
## [1] "Iter 16 - Acc: 0.566 RMSE: 0.5043 LogLoss: 0.7104 Composite: 1.6487"
## [1] "Iter 17 - Acc: 0.5849 RMSE: 0.503 LogLoss: 0.7121 Composite: 1.6302"
## [1] "Iter 18 - Acc: 0.6415 RMSE: 0.4658 LogLoss: 0.624 Composite: 1.4483"
## [1] "Iter 19 - Acc: 0.6698 RMSE: 0.4852 LogLoss: 0.6812 Composite: 1.4966"
## [1] "Iter 20 - Acc: 0.6132 RMSE: 0.4916 LogLoss: 0.6813 Composite: 1.5597"
## [1] "Iter 21 - Acc: 0.6651 RMSE: 0.4708 LogLoss: 0.6353 Composite: 1.441"
## [1] "Iter 22 - Acc: 0.6557 RMSE: 0.4723 LogLoss: 0.6393 Composite: 1.4559"
## [1] "Iter 23 - Acc: 0.7028 RMSE: 0.4657 LogLoss: 0.6277 Composite: 1.3905"
## [1] "Iter 24 - Acc: 0.6557 RMSE: 0.4774 LogLoss: 0.6521 Composite: 1.4738"
## [1] "Iter 25 - Acc: 0.6321 RMSE: 0.4661 LogLoss: 0.623 Composite: 1.4571"
## [1] "Iter 26 - Acc: 0.6604 RMSE: 0.477 LogLoss: 0.6523 Composite: 1.4689"
## [1] "Iter 27 - Acc: 0.6557 RMSE: 0.474 LogLoss: 0.6458 Composite: 1.4641"
## [1] "Iter 28 - Acc: 0.6651 RMSE: 0.479 LogLoss: 0.6562 Composite: 1.4701"
## [1] "Iter 29 - Acc: 0.7028 RMSE: 0.4702 LogLoss: 0.6399 Composite: 1.4073"
## [1] "Iter 30 - Acc: 0.6368 RMSE: 0.472 LogLoss: 0.6393 Composite: 1.4746"
# Report the best RF configuration and its held-out metrics.
rf_summary <- paste("Best RF model - mtry:", overallbestmtry_rf,
                    "ntree:", overallbestntree_rf,
                    "with Composite Score:", round(best_composite_rf, 4),
                    " Accuracy:", round(best_acc_rf, 4),
                    " RMSE:", round(best_rmse_rf, 4),
                    " LogLoss:", round(best_logloss_rf, 4))
print(rf_summary)
## [1] "Best RF model - mtry: 2 ntree: 500 with Composite Score: 1.3639 Accuracy: 0.684 RMSE: 0.4519 LogLoss: 0.5959"
4.5 SVMs
# Metric stores for the 30 repeated SVM train/test splits.
accvec_svm <- numeric(30)     # accuracy per split
rmsevec_svm <- numeric(30)    # RMSE per split
loglossvec_svm <- numeric(30) # log loss per split
compvec_svm <- numeric(30)    # composite score per split
# Lowest composite seen so far, plus the cost value that produced it.
best_composite_svm <- Inf
overallbestC_svm <- NA
set.seed(314)
# Repeat 30 random 80/20 splits; on each, tune a linear-kernel SVM's cost C
# via 5-fold CV and score the held-out 20% on accuracy, RMSE, log loss, and
# the same composite used for the other models.
for(i in 1:30) { # Repeat for 30 iterations
  # Train-test split (80-20)
  train_index_svm <- createDataPartition(playertotalimpact2$Winner_Team1, p = 0.8, list = FALSE)
  train_data_svm <- playertotalimpact2[train_index_svm, ]
  test_data_svm <- playertotalimpact2[-train_index_svm, ]
  # Define Predictors and Target
  train_x_svm <- train_data_svm[, 1:28]
  train_y_svm <- as.factor(train_data_svm$Winner_Team1)
  test_x_svm <- test_data_svm[, 1:28]
  test_y_svm <- as.factor(test_data_svm$Winner_Team1)
  # Hyperparameter grid for SVM (linear kernel): C from 2^-5 to 2^5
  grid_svm <- expand.grid(C = 2^(-5:5))
  # Train SVM with caret::train.
  # BUG FIX: the argument is `tuneGrid` (capital G); the original `tunegrid`
  # was silently swallowed by `...`, so the C grid was never searched and
  # caret fell back to its default tuning.
  svm_model <- train(
    x = train_x_svm,
    y = train_y_svm,
    method = "svmLinear",
    trControl = trainControl(method = "cv", number = 5),
    tuneGrid = grid_svm,
    prob.model = TRUE # passed through to kernlab so class probabilities are available
  )
  # Best C
  best_C_svm <- svm_model$bestTune$C
  # Predictions (class + probs); column 2 of the prob matrix is P(class = "1")
  svm_predictions <- predict(svm_model, newdata = test_x_svm)
  svm_prob <- predict(svm_model, newdata = test_x_svm, type = "prob")[,2]
  # Accuracy
  svm_conf_matrix <- confusionMatrix(svm_predictions, test_y_svm)
  svm_accuracy <- svm_conf_matrix$overall['Accuracy']
  # RMSE (probability vs actual outcome; as.numeric(factor) - 1 maps to 0/1)
  svm_rmse <- sqrt(mean((as.numeric(test_y_svm) - 1 - svm_prob)^2))
  # Log Loss
  svm_logloss <- -mean(ifelse(test_y_svm == "1", log(svm_prob), log(1 - svm_prob)))
  # Composite Score (lower is better)
  svm_composite <- (1 - svm_accuracy) + svm_rmse + svm_logloss
  # Store metrics
  accvec_svm[i] <- svm_accuracy
  rmsevec_svm[i] <- svm_rmse
  loglossvec_svm[i] <- svm_logloss
  compvec_svm[i] <- svm_composite
  # Print metrics
  print(paste("Iter", i,
              "- Acc:", round(svm_accuracy, 4),
              "RMSE:", round(svm_rmse, 4),
              "LogLoss:", round(svm_logloss, 4),
              "Composite:", round(svm_composite, 4)))
  # Track best composite
  if(svm_composite < best_composite_svm){
    best_composite_svm = svm_composite
    best_acc_svm = accvec_svm[i]
    best_rmse_svm = rmsevec_svm[i]
    best_logloss_svm = loglossvec_svm[i]
    overallbestC_svm = best_C_svm
  }
}
## [1] "Iter 1 - Acc: 0.6415 RMSE: 0.4835 LogLoss: 0.6609 Composite: 1.5029"
## [1] "Iter 2 - Acc: 0.6745 RMSE: 0.4647 LogLoss: 0.6222 Composite: 1.4123"
## [1] "Iter 3 - Acc: 0.6698 RMSE: 0.4631 LogLoss: 0.6184 Composite: 1.4117"
## [1] "Iter 4 - Acc: 0.6604 RMSE: 0.4719 LogLoss: 0.6371 Composite: 1.4487"
## [1] "Iter 5 - Acc: 0.6462 RMSE: 0.4679 LogLoss: 0.6283 Composite: 1.4499"
## [1] "Iter 6 - Acc: 0.6604 RMSE: 0.4687 LogLoss: 0.6302 Composite: 1.4385"
## [1] "Iter 7 - Acc: 0.6226 RMSE: 0.4811 LogLoss: 0.6562 Composite: 1.5147"
## [1] "Iter 8 - Acc: 0.6604 RMSE: 0.4708 LogLoss: 0.6367 Composite: 1.4471"
## [1] "Iter 9 - Acc: 0.6368 RMSE: 0.4761 LogLoss: 0.6452 Composite: 1.4845"
## [1] "Iter 10 - Acc: 0.6651 RMSE: 0.4751 LogLoss: 0.6441 Composite: 1.4541"
## [1] "Iter 11 - Acc: 0.6415 RMSE: 0.4813 LogLoss: 0.6582 Composite: 1.498"
## [1] "Iter 12 - Acc: 0.6462 RMSE: 0.4736 LogLoss: 0.6395 Composite: 1.4669"
## [1] "Iter 13 - Acc: 0.6038 RMSE: 0.4776 LogLoss: 0.6471 Composite: 1.5209"
## [1] "Iter 14 - Acc: 0.6415 RMSE: 0.4827 LogLoss: 0.6587 Composite: 1.4999"
## [1] "Iter 15 - Acc: 0.684 RMSE: 0.4649 LogLoss: 0.6234 Composite: 1.4043"
## [1] "Iter 16 - Acc: 0.6792 RMSE: 0.4576 LogLoss: 0.6074 Composite: 1.3858"
## [1] "Iter 17 - Acc: 0.6509 RMSE: 0.4614 LogLoss: 0.6141 Composite: 1.4246"
## [1] "Iter 18 - Acc: 0.717 RMSE: 0.4549 LogLoss: 0.603 Composite: 1.341"
## [1] "Iter 19 - Acc: 0.6368 RMSE: 0.4759 LogLoss: 0.645 Composite: 1.4841"
## [1] "Iter 20 - Acc: 0.6132 RMSE: 0.482 LogLoss: 0.6576 Composite: 1.5264"
## [1] "Iter 21 - Acc: 0.6415 RMSE: 0.4784 LogLoss: 0.6501 Composite: 1.4869"
## [1] "Iter 22 - Acc: 0.5943 RMSE: 0.4835 LogLoss: 0.6601 Composite: 1.5492"
## [1] "Iter 23 - Acc: 0.6179 RMSE: 0.4787 LogLoss: 0.6536 Composite: 1.5144"
## [1] "Iter 24 - Acc: 0.6368 RMSE: 0.487 LogLoss: 0.6714 Composite: 1.5217"
## [1] "Iter 25 - Acc: 0.7028 RMSE: 0.4441 LogLoss: 0.5813 Composite: 1.3226"
## [1] "Iter 26 - Acc: 0.6321 RMSE: 0.4711 LogLoss: 0.6337 Composite: 1.4727"
## [1] "Iter 27 - Acc: 0.6745 RMSE: 0.4683 LogLoss: 0.6302 Composite: 1.424"
## [1] "Iter 28 - Acc: 0.7075 RMSE: 0.4511 LogLoss: 0.5962 Composite: 1.3398"
## [1] "Iter 29 - Acc: 0.6792 RMSE: 0.4639 LogLoss: 0.6227 Composite: 1.4073"
## [1] "Iter 30 - Acc: 0.6509 RMSE: 0.4729 LogLoss: 0.6371 Composite: 1.459"
# Report the best SVM cost value and its held-out metrics.
svm_summary <- paste("Best SVM model - C:", overallbestC_svm,
                     "with Composite Score:", round(best_composite_svm, 4),
                     " Accuracy:", round(best_acc_svm, 4),
                     " RMSE:", round(best_rmse_svm, 4),
                     " LogLoss:", round(best_logloss_svm, 4))
print(svm_summary)
## [1] "Best SVM model - C: 1 with Composite Score: 1.3226 Accuracy: 0.7028 RMSE: 0.4441 LogLoss: 0.5813"
4.6 Model Comparison
# Format a normal-approximation 95% interval "(lo, hi)" around `center`
# using the spread of the 30 per-split composite scores.
# NOTE(review): the interval is centered on the BEST composite score rather
# than the mean across splits — confirm this is intentional.
ci_label <- function(center, scores) {
  margin <- 1.96 * sd(scores) / sqrt(30)
  paste0("(", round(center - margin, 4), ", ", round(center + margin, 4), ")")
}
# Side-by-side comparison of all four approaches (heuristic has no
# probabilistic output, hence the NAs).
model_scores <- data.frame(
  Model = c("Heuristic", "Logistic Regression", "Random Forest", "SVM"),
  Accuracy = c(round(accuracy, 4), round(best_acc_lr, 4), round(best_acc_rf, 4), round(best_acc_svm, 4)),
  RMSE = c(NA, round(best_rmse_lr, 4), round(best_rmse_rf, 4), round(best_rmse_svm, 4)),
  LogLoss = c(NA, round(best_logloss_lr, 4), round(best_logloss_rf, 4), round(best_logloss_svm, 4)),
  CompositeScore = c(NA, round(best_composite_lr, 4), round(best_composite_rf, 4), round(best_composite_svm, 4)),
  Conf_Interval = c(
    NA,
    ci_label(best_composite_lr, compvector_lr),
    ci_label(best_composite_rf, compvec_rf),
    ci_label(best_composite_svm, compvec_svm)
  )
)
# Print with nice column labels
# Print with nice column labels; outer parentheses force auto-printing.
display_names <- c(
  "Model",
  "Accuracy",
  "RMSE",
  "Log-Loss",
  "Composite Score",
  "95% Confidence Interval"
)
(setNames(model_scores, display_names))## Model Accuracy RMSE Log-Loss Composite Score
## 1 Heuristic 0.6598 NA NA NA
## 2 Logistic Regression 0.7170 0.4474 0.5876 1.3181
## 3 Random Forest 0.6840 0.4519 0.5959 1.3639
## 4 SVM 0.7028 0.4441 0.5813 1.3226
## 95% Confidence Interval
## 1 <NA>
## 2 (1.2997, 1.3364)
## 3 (1.3391, 1.3887)
## 4 (1.3017, 1.3434)