2 Examining and Cleaning the Data
We begin with a preliminary examination of the data we will be working with:
= read.csv("Gamelog T20I.csv")
gamelog dim(gamelog)
## [1] 168966 21
= length(unique(gamelog$MatchNo))
matches print(matches)
## [1] 688
head(gamelog, n=3)
## Format MatchNo TeamBowling TeamBatting Inning Over Ball Bowler BowlerID Batsman BatsmanID Fielder FielderID Outcome
## 1 T20I 33 AUS BD 1 0 1 B Lee 17 Tamim Iqbal 1041 NA no
## 2 T20I 33 AUS BD 1 0 2 B Lee 17 Tamim Iqbal 1041 NA no
## 3 T20I 33 AUS BD 1 0 3 B Lee 17 Tamim Iqbal 1041 NA no
## NumOutcome BallType NumBallType Notes
## 1 0 run 0 good
## 2 0 run 0 short
## 3 0 run 0 short
## FullNotes IDflag Wickets
## 1 good start by Lee dug in short of a length outside off, gives Tamim a lifter, left alone 0 0
## 2 short of a length outside off again this time Tamim gets some bat on it and pushes to point 0 0
## 3 short again and aimed at the body Tamim gets on the backfoot and defends to the off side 0 0
summary(subset(gamelog,select=c(MatchNo,Inning,Over,Ball,Outcome,NumOutcome,IDflag,Wickets)))
## MatchNo Inning Over Ball Outcome NumOutcome IDflag
## Min. : 1 Min. :1.000 Min. : 0.000 Min. :1.000 Length:168966 Min. : -1.000 Min. :0.00000
## 1st Qu.: 244 1st Qu.:1.000 1st Qu.: 4.000 1st Qu.:2.000 Class :character 1st Qu.: 0.000 1st Qu.:0.00000
## Median :200938 Median :1.000 Median : 9.000 Median :3.000 Mode :character Median : 1.000 Median :0.00000
## Mean :112498 Mean :1.455 Mean : 9.127 Mean :3.488 Mean : 1.238 Mean :0.09382
## 3rd Qu.:201316 3rd Qu.:2.000 3rd Qu.:14.000 3rd Qu.:5.000 3rd Qu.: 1.000 3rd Qu.:0.00000
## Max. :201560 Max. :2.000 Max. :19.000 Max. :6.000 Max. :2015.000 Max. :3.00000
## NA's :16
## Wickets
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 2.000
## Mean : 2.553
## 3rd Qu.: 4.000
## Max. :24.000
##
We see that there are a total of 168,966 balls thrown over 688 unique games. We convert several character variables to factors where it makes sense to do so:
$Format = factor(gamelog$Format)
gameloglevels(gamelog$Format)
## [1] " " "IPL" "T20I"
$Outcome = factor(gamelog$Outcome)
gameloglevels(gamelog$Outcome)
## [1] "" "1" "2" "2015" "3" "4" "5" "6" "7" "FOUR" "no" "OUT" "SIX"
$BallType= factor(gamelog$BallType) gamelog
According to the rules of Twenty20 Cricket, there can only be up to 9 wickets in an innings, but we see above that there are some balls in which the wicket is higher than 9, indeed up to 24. This warrants some inspection, and gives rise to a host of issues with the data, including:
- wickets accumulating further rather than resetting to 0 at the start of the 2nd innings
- some matches not labeling the second innings (i.e. all observations are labelled as innings 1)
- some matches in which the batting and bowling teams did not switch when the innings transitioned from first to second
- duplicate matches and innings
- multiple matches having the same
MatchNo
(match 200901, which corresponds to the rows having empty format) - Strange values for
NumOutcome
variable (i.e. 2015)
We tackle all these problems with the following code:
library(dplyr)
= distinct(gamelog) # Remove duplicates
gamelog
for (matchID in unique(gamelog$MatchNo[gamelog$Wickets>9])) {
# Check if the dataset missed the transition from first to second inning
if (nrow(unique(subset(gamelog,MatchNo==matchID,Inning))) == 1) {
# 2 possibilities:
# 1. The first inning ended after 20 overs
if (nrow(subset(gamelog,MatchNo==matchID & Over==19 & Ball==6 & Wickets < 10)) > 0) {
#Number of wickets on last ball of first innings
= as.numeric(subset(gamelog,MatchNo==matchID & Over==19 & Ball==6 & Wickets < 10 &
wickets !NumBallType %in% c(1,2),Wickets)[1])
#Properly label 2nd innings
$Inning[gamelog$MatchNo==matchID & gamelog$Wickets > wickets] = 2
gamelog
#Properly label 2nd innings
$Inning[gamelog$MatchNo==matchID & gamelog$Wickets==wickets &
gamelog$Over < 10] = 2
gamelog
#Reset wickets
$Wickets[gamelog$Inning==2 & gamelog$MatchNo==matchID] =
gamelog$Wickets[gamelog$Inning==2 & gamelog$MatchNo==matchID] - wickets
gamelog
}
#2. The first inning ended after 10 wickets
else if (nrow(subset(gamelog,Wickets==9 & NumOutcome==-1))>0) {
#Properly label 2nd innings
$Inning[gamelog$MatchNo==matchID & gamelog$Wickets > 9] = 2
gamelog
#Reset wickets
$Wickets[gamelog$MatchNo==matchID & gamelog$Inning==2] =
gamelog$Wickets[gamelog$MatchNo==matchID & gamelog$Inning==2] - 10
gamelog
}
}
}
#Filter out random rows with nonsensical data
= subset(gamelog,!(MatchNo %in% c(200943,201149,201212,201323,201356,201558) & Wickets==10))
gamelog
# Filter out duplicate matches and innings based on number of plays which have
#the exact same commentary
<- gamelog %>% dplyr::filter(duplicated(FullNotes) & FullNotes != "") %>%
dups ::group_by(MatchNo) %>% dplyr::summarise(dups = length(FullNotes))
dplyr<- gamelog %>% dplyr::group_by(MatchNo) %>% dplyr::summarise(plays = length(FullNotes))
agg <- merge(dups,agg)
check <- unique(check %>% filter(plays - dups < 10) %>% pull(MatchNo))
dup_matches
<- subset(gamelog,!MatchNo %in% dup_matches) #Remove duplicate matches
gamelog
%>% dplyr::filter(!(MatchNo %in% dup_matches) & dups > 50)
dups
unique(subset(gamelog,MatchNo==200901 & Inning==1,c(TeamBowling,TeamBatting)))
$Format[gamelog$Format==" "] = "IPL"
gamelog$MatchNo[(gamelog$MatchNo==200901 & gamelog$TeamBowling=="MI" &
gamelog$TeamBatting=="CSK")|
gamelog$MatchNo==200901 & gamelog$TeamBowling=="CSK" &
(gamelog$TeamBatting=="MI")] = 2009011
gamelog$MatchNo[(gamelog$MatchNo==200901 & gamelog$TeamBowling=="RR" &
gamelog$TeamBatting=="SH")|
gamelog$MatchNo==200901 & gamelog$TeamBowling=="SH" &
(gamelog$TeamBatting=="RR")] = 2009012
gamelog$MatchNo[(gamelog$MatchNo==200901 & gamelog$TeamBowling=="MI" &
gamelog$TeamBatting=="RR")|
gamelog$MatchNo==200901 & gamelog$TeamBowling=="RR" &
(gamelog$TeamBatting=="MI")] = 2009013
gamelog
<- subset(gamelog,!(MatchNo==200901 & TeamBowling %in% c("KXP","KKR") &
gamelog %in% c("KXP","KKR")))
TeamBatting
<- gamelog[c(1:77751,77994:160685),] #Remove duplicates of #200921
gamelog
<- gamelog %>% group_by(MatchNo) %>%
innings_check summarise(innings=sum(length(unique(Inning))==2) + 1)
#Matches with only one innings
<- merge(agg,innings_check[innings_check$innings==1,])
innings_check
<- gamelog %>% dplyr::filter(duplicated(FullNotes) & FullNotes != "") %>%
dups group_by(MatchNo) %>% summarise(dups = length(FullNotes))
<- unique(dups %>% filter(!(MatchNo %in% dup_matches) & dups > 50)
dup_innings %>% pull(MatchNo))
# Label second innings and switch bowling and batting teams at change of innings and reset wickets
$MatchNo==440 & gamelog$Bowler != 'Mpofu','TeamBowling'] <- 'ZIM'
gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','TeamBatting'] <- 'MI'
gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','Inning'] <- 2
gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','Wickets'] <-
gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','Wickets'] - 3
gamelog[gamelog
$MatchNo==200903 & (is.na(gamelog$BatsmanID) |
gamelog[gamelog$BatsmanID==3027),'TeamBowling'] <- 'KXP'
gamelog$MatchNo==200903 & (is.na(gamelog$BatsmanID) |
gamelog[gamelog$BatsmanID==3027),'TeamBatting'] <- 'DD'
gamelog$MatchNo==200903 & (is.na(gamelog$BatsmanID) |
gamelog[gamelog$BatsmanID==3027),'Inning'] <- 2
gamelog$MatchNo==200903 & (is.na(gamelog$BatsmanID) |
gamelog[gamelog$BatsmanID==3027),'Wickets'] <- 1
gamelog
$MatchNo==200906 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 17)),'TeamBowling'] <- 'KXP'
(gamelog$MatchNo==200906 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 17)),'TeamBatting'] <- 'KKR'
(gamelog$MatchNo==200906 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 17)),'Inning'] <- 2
(gamelog$MatchNo==200906 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 17)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==200906 & (Wickets > 6 |
$Wickets==6 & Over < 17)),'Wickets']) - 6
(gamelog
$MatchNo==200938 & (gamelog$Wickets > 2 |
gamelog[gamelog$Wickets==2 & gamelog$Over < 8)),'TeamBowling'] <- 'MI'
(gamelog$MatchNo==200938 & (gamelog$Wickets > 2 |
gamelog[gamelog$Wickets==2 & gamelog$Over < 8)),'TeamBatting'] <- 'RCB'
(gamelog$MatchNo==200938 & (gamelog$Wickets > 2 |
gamelog[gamelog$Wickets==2 & gamelog$Over < 8)),'Inning'] <- 2
(gamelog$MatchNo==200938 & (gamelog$Wickets > 2 |
gamelog[gamelog$Wickets==2 & gamelog$Over < 8)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==200938 & (Wickets > 2 |
==2 & Over < 8)),'Wickets']) - 2
(Wickets
$MatchNo==200947 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 19)),'TeamBowling'] <- 'MI'
(gamelog$MatchNo==200947 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 19)),'TeamBatting'] <- 'CSK'
(gamelog$MatchNo==200947 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 19)),'Inning'] <- 2
(gamelog$MatchNo==200947 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 19)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==200947 & (Wickets > 4 |
==4 & Over < 19)),'Wickets']) - 4
(Wickets
$MatchNo==200948 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 19)),'TeamBowling'] <- 'KKR'
(gamelog$MatchNo==200948 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 19)),'TeamBatting'] <- 'DC'
(gamelog$MatchNo==200948 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 19)),'Inning'] <- 2
(gamelog$MatchNo==200948 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 19)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==200948 & (Wickets > 5 |
==5 & Over < 19)),'Wickets']) - 5
(Wickets
$MatchNo==201103 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 18)),'TeamBowling'] <- 'KTK'
(gamelog$MatchNo==201103 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 18)),'TeamBatting'] <- 'RCB'
(gamelog$MatchNo==201103 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 18)),'Inning'] <- 2
(gamelog$MatchNo==201103 & (gamelog$Wickets > 5 |
gamelog[gamelog$Wickets==5 & gamelog$Over < 18)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==201103 & (Wickets > 5 |
==5 & Over < 18)),'Wickets']) - 5
(Wickets
$MatchNo==201109 & gamelog$Wickets > 2,'TeamBowling'] <- 'CSK'
gamelog[gamelog$MatchNo==201109 & gamelog$Wickets > 2,'TeamBatting'] <- 'KXP'
gamelog[gamelog$MatchNo==201109 & gamelog$Wickets > 2,'Inning'] <- 2
gamelog[gamelog$MatchNo==201109 & gamelog$Wickets > 2,'Wickets'] <-
gamelog[gamelogwith(gamelog,gamelog[MatchNo==201109 & Wickets > 2,'Wickets']) - 2
$MatchNo==201112 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 16)),'TeamBowling'] <- 'RR'
(gamelog$MatchNo==201112 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 16)),'TeamBatting'] <- 'KKR'
(gamelog$MatchNo==201112 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 16)),'Inning'] <- 2
(gamelog$MatchNo==201112 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 16)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==201112 & (Wickets > 4 |
==4 & Over < 16)),'Wickets']) - 4
(Wickets
$MatchNo==201131 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBowling'] <- 'PW'
(gamelog$MatchNo==201131 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBatting'] <- 'CSK'
(gamelog$MatchNo==201131 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'Inning'] <- 2
(gamelog$MatchNo==201131 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==201131 & (Wickets > 6 |
==6 & Over < 18)),'Wickets']) - 6
(Wickets
$MatchNo==201211 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBowling'] <- 'CSK'
(gamelog$MatchNo==201211 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBatting'] <- 'DD'
(gamelog$MatchNo==201211 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'Inning'] <- 2
(gamelog$MatchNo==201211 & (gamelog$Wickets > 6 |
gamelog[gamelog$Wickets==6 & gamelog$Over < 18)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==201211 & (Wickets > 6 |
==6 & Over < 18)),'Wickets']) - 6
(Wickets
$MatchNo==201228 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 18)),'TeamBowling'] <- 'MI'
(gamelog$MatchNo==201228 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 18)),'TeamBatting'] <- 'KXP'
(gamelog$MatchNo==201228 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 18)),'Inning'] <- 2
(gamelog$MatchNo==201228 & (gamelog$Wickets > 4 |
gamelog[gamelog$Wickets==4 & gamelog$Over < 18)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==201228 & (Wickets > 4 |
==4 & Over < 18)),'Wickets']) - 4
(Wickets
$MatchNo==201258 & (gamelog$Wickets > 1 |
gamelog[gamelog$Wickets==1 & gamelog$Over ==0)),'TeamBowling'] <- 'MI'
(gamelog$MatchNo==201258 & (gamelog$Wickets > 1 |
gamelog[gamelog$Wickets==1 & gamelog$Over ==0)),'TeamBatting'] <- 'KKR'
(gamelog$MatchNo==201258 & (gamelog$Wickets > 1 |
gamelog[gamelog$Wickets==1 & gamelog$Over ==0)),'Inning'] <- 2
(gamelog$MatchNo==201258 & (gamelog$Wickets > 1 |
gamelog[gamelog$Wickets==1 & gamelog$Over ==0)),'Wickets'] <-
(gamelogwith(gamelog,gamelog[MatchNo==201258 & (Wickets > 1 |
==1 & Over ==0)),'Wickets']) - 1
(Wickets
# Remove duplicate innings
<- subset(gamelog,!(MatchNo %in% dup_innings & Inning==2))
gamelog
= subset(gamelog,!NumOutcome==2015) #Nonsense rows gamelog
Next we take a look at missing data:
library(mice)
md.pattern(gamelog,rotate.names=TRUE)
## Format MatchNo TeamBowling TeamBatting Inning Over Ball Bowler Batsman Fielder Outcome NumOutcome BallType
## 12160 1 1 1 1 1 1 1 1 1 1 1 1 1
## 134490 1 1 1 1 1 1 1 1 1 1 1 1 1
## 274 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4696 1 1 1 1 1 1 1 1 1 1 1 1 1
## 224 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3359 1 1 1 1 1 1 1 1 1 1 1 1 1
## 197 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2663 1 1 1 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0 0 0 0
## NumBallType Notes FullNotes IDflag Wickets ResourcesLeft RunsScored DLSDifferential Sentiment highlight_worthiness
## 12160 1 1 1 1 1 1 1 1 1 1
## 134490 1 1 1 1 1 1 1 1 1 1
## 274 1 1 1 1 1 1 1 1 1 1
## 4696 1 1 1 1 1 1 1 1 1 1
## 224 1 1 1 1 1 1 1 1 1 1
## 3359 1 1 1 1 1 1 1 1 1 1
## 197 1 1 1 1 1 1 1 1 1 1
## 2663 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0
## stdized_sentiment ground_gained ground_gained_score BowlerID BatsmanID FielderID
## 12160 1 1 1 1 1 1 0
## 134490 1 1 1 1 1 0 1
## 274 1 1 1 1 0 1 1
## 4696 1 1 1 1 0 0 2
## 224 1 1 1 0 1 1 1
## 3359 1 1 1 0 1 0 2
## 197 1 1 1 0 0 1 2
## 2663 1 1 1 0 0 0 3
## 0 0 0 6443 7830 145208 159481
All of the missing items fall under the BowlerID
, BatsmanID
, and FielderID
fields. Since we are unlikely to focus on specific bowlers/batsmen/fielders, this does not bother us too much.