2 Examining and Cleaning the Data

We begin with a preliminary examination of the data we will be working with:

gamelog = read.csv("Gamelog T20I.csv")
dim(gamelog)
## [1] 168966     21
matches = length(unique(gamelog$MatchNo))
print(matches)
## [1] 688
head(gamelog, n=3)
##   Format MatchNo TeamBowling TeamBatting Inning Over Ball Bowler BowlerID     Batsman BatsmanID Fielder FielderID Outcome
## 1   T20I      33         AUS          BD      1    0    1  B Lee       17 Tamim Iqbal      1041                NA      no
## 2   T20I      33         AUS          BD      1    0    2  B Lee       17 Tamim Iqbal      1041                NA      no
## 3   T20I      33         AUS          BD      1    0    3  B Lee       17 Tamim Iqbal      1041                NA      no
##   NumOutcome BallType NumBallType Notes
## 1          0      run           0  good
## 2          0      run           0 short
## 3          0      run           0 short
##                                                                                       FullNotes IDflag Wickets
## 1    good start by Lee   dug in short of a length outside off, gives Tamim a lifter, left alone      0       0
## 2 short of a length outside off again   this time Tamim gets some bat on it and pushes to point      0       0
## 3    short again and aimed at the body   Tamim gets on the backfoot and defends to the off side      0       0
summary(subset(gamelog,select=c(MatchNo,Inning,Over,Ball,Outcome,NumOutcome,IDflag,Wickets)))
##     MatchNo           Inning           Over             Ball         Outcome            NumOutcome           IDflag       
##  Min.   :     1   Min.   :1.000   Min.   : 0.000   Min.   :1.000   Length:168966      Min.   :  -1.000   Min.   :0.00000  
##  1st Qu.:   244   1st Qu.:1.000   1st Qu.: 4.000   1st Qu.:2.000   Class :character   1st Qu.:   0.000   1st Qu.:0.00000  
##  Median :200938   Median :1.000   Median : 9.000   Median :3.000   Mode  :character   Median :   1.000   Median :0.00000  
##  Mean   :112498   Mean   :1.455   Mean   : 9.127   Mean   :3.488                      Mean   :   1.238   Mean   :0.09382  
##  3rd Qu.:201316   3rd Qu.:2.000   3rd Qu.:14.000   3rd Qu.:5.000                      3rd Qu.:   1.000   3rd Qu.:0.00000  
##  Max.   :201560   Max.   :2.000   Max.   :19.000   Max.   :6.000                      Max.   :2015.000   Max.   :3.00000  
##                                                                                       NA's   :16                          
##     Wickets      
##  Min.   : 0.000  
##  1st Qu.: 1.000  
##  Median : 2.000  
##  Mean   : 2.553  
##  3rd Qu.: 4.000  
##  Max.   :24.000  
## 

We see that there are a total of 168,966 balls thrown over 688 unique games. We convert several character variables to factors where it makes sense to do so:

gamelog$Format = factor(gamelog$Format)
levels(gamelog$Format)
## [1] " "    "IPL"  "T20I"
gamelog$Outcome = factor(gamelog$Outcome)
levels(gamelog$Outcome)
##  [1] ""     "1"    "2"    "2015" "3"    "4"    "5"    "6"    "7"    "FOUR" "no"   "OUT"  "SIX"
gamelog$BallType= factor(gamelog$BallType)

According to the rules of Twenty20 Cricket, there can only be up to 9 wickets in an innings, but we see above that there are some balls in which the wicket is higher than 9, indeed up to 24. This warrants some inspection, and gives rise to a host of issues with the data, including:

  • wickets accumulating further rather than resetting to 0 at the start of the 2nd innings
  • some matches not labeling the second innings (i.e. all observations are labelled as innings 1)
  • some matches in which the batting and bowling teams did not switch when the innings transitioned from first to second
  • duplicate matches and innings
  • multiple matches having the same MatchNo (match 200901, which corresponds to the rows having empty format)
  • Strange values for NumOutcome variable (i.e. 2015)

We tackle all these problems with the following code:

library(dplyr)

gamelog = distinct(gamelog) # Remove duplicates

for (matchID in unique(gamelog$MatchNo[gamelog$Wickets>9])) {
  
  # Check if the dataset missed the transition from first to second inning
  if (nrow(unique(subset(gamelog,MatchNo==matchID,Inning))) == 1) {
    # 2 possibilities:
    
    # 1. The first inning ended after 20 overs
    if (nrow(subset(gamelog,MatchNo==matchID & Over==19 & Ball==6 & Wickets < 10)) > 0) {
      
      #Number of wickets on last ball of first innings
      wickets = as.numeric(subset(gamelog,MatchNo==matchID & Over==19 & Ball==6 & Wickets < 10 &
                                  !NumBallType %in% c(1,2),Wickets)[1]) 
      
      #Properly label 2nd innings
      gamelog$Inning[gamelog$MatchNo==matchID & gamelog$Wickets > wickets] = 2 
      
      #Properly label 2nd innings
      gamelog$Inning[gamelog$MatchNo==matchID & gamelog$Wickets==wickets & 
                     gamelog$Over < 10] = 2 
      
      #Reset wickets
      gamelog$Wickets[gamelog$Inning==2 & gamelog$MatchNo==matchID] = 
          gamelog$Wickets[gamelog$Inning==2 & gamelog$MatchNo==matchID] - wickets 
    }
    
    #2. The first inning ended after 10 wickets
    else if (nrow(subset(gamelog,Wickets==9 & NumOutcome==-1))>0) {
      
      #Properly label 2nd innings
      gamelog$Inning[gamelog$MatchNo==matchID & gamelog$Wickets > 9] = 2 
      
      #Reset wickets
      gamelog$Wickets[gamelog$MatchNo==matchID & gamelog$Inning==2] = 
              gamelog$Wickets[gamelog$MatchNo==matchID & gamelog$Inning==2] - 10 
    }
    
  }
}

#Filter out random rows with nonsensical data
gamelog = subset(gamelog,!(MatchNo %in% c(200943,201149,201212,201323,201356,201558) & Wickets==10)) 

# Filter out duplicate matches and innings based on number of plays which have 
#the exact same commentary

dups <- gamelog %>% dplyr::filter(duplicated(FullNotes) & FullNotes != "") %>% 
        dplyr::group_by(MatchNo) %>% dplyr::summarise(dups = length(FullNotes))
agg <- gamelog %>% dplyr::group_by(MatchNo) %>% dplyr::summarise(plays = length(FullNotes))
check <- merge(dups,agg)
dup_matches <- unique(check %>% filter(plays - dups < 10) %>% pull(MatchNo))

gamelog <- subset(gamelog,!MatchNo %in% dup_matches) #Remove duplicate matches

dups %>% dplyr::filter(!(MatchNo %in% dup_matches) & dups > 50)

unique(subset(gamelog,MatchNo==200901 & Inning==1,c(TeamBowling,TeamBatting)))
gamelog$Format[gamelog$Format==" "] = "IPL"
gamelog$MatchNo[(gamelog$MatchNo==200901 & gamelog$TeamBowling=="MI" & 
                   gamelog$TeamBatting=="CSK")|
                (gamelog$MatchNo==200901 & gamelog$TeamBowling=="CSK" & 
                   gamelog$TeamBatting=="MI")] = 2009011
gamelog$MatchNo[(gamelog$MatchNo==200901 & gamelog$TeamBowling=="RR" & 
                   gamelog$TeamBatting=="SH")|
                (gamelog$MatchNo==200901 & gamelog$TeamBowling=="SH" & 
                   gamelog$TeamBatting=="RR")] = 2009012
gamelog$MatchNo[(gamelog$MatchNo==200901 & gamelog$TeamBowling=="MI" & 
                   gamelog$TeamBatting=="RR")|
                (gamelog$MatchNo==200901 & gamelog$TeamBowling=="RR" & 
                   gamelog$TeamBatting=="MI")] = 2009013

gamelog <- subset(gamelog,!(MatchNo==200901 & TeamBowling %in% c("KXP","KKR") &
                          TeamBatting %in% c("KXP","KKR")))

gamelog <- gamelog[c(1:77751,77994:160685),] #Remove duplicates of #200921

innings_check <- gamelog %>% group_by(MatchNo) %>% 
                 summarise(innings=sum(length(unique(Inning))==2) + 1)

#Matches with only one innings
innings_check <- merge(agg,innings_check[innings_check$innings==1,]) 

dups <- gamelog %>% dplyr::filter(duplicated(FullNotes) & FullNotes != "") %>% 
        group_by(MatchNo) %>% summarise(dups = length(FullNotes))

dup_innings <- unique(dups %>% filter(!(MatchNo %in% dup_matches) & dups > 50) 
                      %>% pull(MatchNo))

# Label second innings and switch bowling and batting teams at change of innings and reset wickets

gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','TeamBowling'] <- 'ZIM'
gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','TeamBatting'] <- 'MI'
gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','Inning'] <- 2
gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','Wickets'] <- 
  gamelog[gamelog$MatchNo==440 & gamelog$Bowler != 'Mpofu','Wickets'] - 3

gamelog[gamelog$MatchNo==200903 & (is.na(gamelog$BatsmanID) | 
                                     gamelog$BatsmanID==3027),'TeamBowling'] <- 'KXP'
gamelog[gamelog$MatchNo==200903 & (is.na(gamelog$BatsmanID) | 
                                     gamelog$BatsmanID==3027),'TeamBatting'] <- 'DD'
gamelog[gamelog$MatchNo==200903 & (is.na(gamelog$BatsmanID) | 
                                     gamelog$BatsmanID==3027),'Inning'] <- 2
gamelog[gamelog$MatchNo==200903 & (is.na(gamelog$BatsmanID) | 
                                     gamelog$BatsmanID==3027),'Wickets'] <- 1

gamelog[gamelog$MatchNo==200906 & (gamelog$Wickets > 6 | 
                                  (gamelog$Wickets==6 & gamelog$Over < 17)),'TeamBowling'] <- 'KXP'
gamelog[gamelog$MatchNo==200906 & (gamelog$Wickets > 6 | 
                                  (gamelog$Wickets==6 & gamelog$Over < 17)),'TeamBatting'] <- 'KKR'
gamelog[gamelog$MatchNo==200906 & (gamelog$Wickets > 6 | 
                                  (gamelog$Wickets==6 & gamelog$Over < 17)),'Inning'] <- 2
gamelog[gamelog$MatchNo==200906 & (gamelog$Wickets > 6 | 
                                  (gamelog$Wickets==6 & gamelog$Over < 17)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==200906 & (Wickets > 6 | 
                       (gamelog$Wickets==6 & Over < 17)),'Wickets']) - 6

gamelog[gamelog$MatchNo==200938 & (gamelog$Wickets > 2 | 
        (gamelog$Wickets==2 & gamelog$Over < 8)),'TeamBowling'] <- 'MI'
gamelog[gamelog$MatchNo==200938 & (gamelog$Wickets > 2 | 
        (gamelog$Wickets==2 & gamelog$Over < 8)),'TeamBatting'] <- 'RCB'
gamelog[gamelog$MatchNo==200938 & (gamelog$Wickets > 2 | 
        (gamelog$Wickets==2 & gamelog$Over < 8)),'Inning'] <- 2
gamelog[gamelog$MatchNo==200938 & (gamelog$Wickets > 2 | 
        (gamelog$Wickets==2 & gamelog$Over < 8)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==200938 & (Wickets > 2 | 
                      (Wickets==2 & Over < 8)),'Wickets']) - 2

gamelog[gamelog$MatchNo==200947 & (gamelog$Wickets > 4 | 
       (gamelog$Wickets==4 & gamelog$Over < 19)),'TeamBowling'] <- 'MI'
gamelog[gamelog$MatchNo==200947 & (gamelog$Wickets > 4 | 
       (gamelog$Wickets==4 & gamelog$Over < 19)),'TeamBatting'] <- 'CSK'
gamelog[gamelog$MatchNo==200947 & (gamelog$Wickets > 4 | 
        (gamelog$Wickets==4 & gamelog$Over < 19)),'Inning'] <- 2
gamelog[gamelog$MatchNo==200947 & (gamelog$Wickets > 4 | 
        (gamelog$Wickets==4 & gamelog$Over < 19)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==200947 & (Wickets > 4 | 
       (Wickets==4 & Over < 19)),'Wickets']) - 4

gamelog[gamelog$MatchNo==200948 & (gamelog$Wickets > 5 | 
       (gamelog$Wickets==5 & gamelog$Over < 19)),'TeamBowling'] <- 'KKR'
gamelog[gamelog$MatchNo==200948 & (gamelog$Wickets > 5 | 
       (gamelog$Wickets==5 & gamelog$Over < 19)),'TeamBatting'] <- 'DC'
gamelog[gamelog$MatchNo==200948 & (gamelog$Wickets > 5 | 
        (gamelog$Wickets==5 & gamelog$Over < 19)),'Inning'] <- 2
gamelog[gamelog$MatchNo==200948 & (gamelog$Wickets > 5 | 
       (gamelog$Wickets==5 & gamelog$Over < 19)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==200948 & (Wickets > 5 | 
      (Wickets==5 & Over < 19)),'Wickets']) - 5

gamelog[gamelog$MatchNo==201103 & (gamelog$Wickets > 5 | 
        (gamelog$Wickets==5 & gamelog$Over < 18)),'TeamBowling'] <- 'KTK'
gamelog[gamelog$MatchNo==201103 & (gamelog$Wickets > 5 | 
        (gamelog$Wickets==5 & gamelog$Over < 18)),'TeamBatting'] <- 'RCB'
gamelog[gamelog$MatchNo==201103 & (gamelog$Wickets > 5 | 
        (gamelog$Wickets==5 & gamelog$Over < 18)),'Inning'] <- 2
gamelog[gamelog$MatchNo==201103 & (gamelog$Wickets > 5 | 
       (gamelog$Wickets==5 & gamelog$Over < 18)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==201103 & (Wickets > 5 | 
      (Wickets==5 & Over < 18)),'Wickets']) - 5

gamelog[gamelog$MatchNo==201109 & gamelog$Wickets > 2,'TeamBowling'] <- 'CSK'
gamelog[gamelog$MatchNo==201109 & gamelog$Wickets > 2,'TeamBatting'] <- 'KXP'
gamelog[gamelog$MatchNo==201109 & gamelog$Wickets > 2,'Inning'] <- 2
gamelog[gamelog$MatchNo==201109 & gamelog$Wickets > 2,'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==201109 & Wickets > 2,'Wickets']) - 2

gamelog[gamelog$MatchNo==201112 & (gamelog$Wickets > 4 | 
        (gamelog$Wickets==4 & gamelog$Over < 16)),'TeamBowling'] <- 'RR'
gamelog[gamelog$MatchNo==201112 & (gamelog$Wickets > 4 | 
        (gamelog$Wickets==4 & gamelog$Over < 16)),'TeamBatting'] <- 'KKR'
gamelog[gamelog$MatchNo==201112 & (gamelog$Wickets > 4 | 
        (gamelog$Wickets==4 & gamelog$Over < 16)),'Inning'] <- 2
gamelog[gamelog$MatchNo==201112 & (gamelog$Wickets > 4 | 
        (gamelog$Wickets==4 & gamelog$Over < 16)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==201112 & (Wickets > 4 | 
        (Wickets==4 & Over < 16)),'Wickets']) - 4

gamelog[gamelog$MatchNo==201131 & (gamelog$Wickets > 6 | 
        (gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBowling'] <- 'PW'
gamelog[gamelog$MatchNo==201131 & (gamelog$Wickets > 6 | 
        (gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBatting'] <- 'CSK'
gamelog[gamelog$MatchNo==201131 & (gamelog$Wickets > 6 | 
        (gamelog$Wickets==6 & gamelog$Over < 18)),'Inning'] <- 2
gamelog[gamelog$MatchNo==201131 & (gamelog$Wickets > 6 | 
        (gamelog$Wickets==6 & gamelog$Over < 18)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==201131 & (Wickets > 6 | 
      (Wickets==6 & Over < 18)),'Wickets']) - 6

gamelog[gamelog$MatchNo==201211 & (gamelog$Wickets > 6 | 
       (gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBowling'] <- 'CSK'
gamelog[gamelog$MatchNo==201211 & (gamelog$Wickets > 6 | 
       (gamelog$Wickets==6 & gamelog$Over < 18)),'TeamBatting'] <- 'DD'
gamelog[gamelog$MatchNo==201211 & (gamelog$Wickets > 6 | 
       (gamelog$Wickets==6 & gamelog$Over < 18)),'Inning'] <- 2
gamelog[gamelog$MatchNo==201211 & (gamelog$Wickets > 6 | 
       (gamelog$Wickets==6 & gamelog$Over < 18)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==201211 & (Wickets > 6 | 
      (Wickets==6 & Over < 18)),'Wickets']) - 6

gamelog[gamelog$MatchNo==201228 & (gamelog$Wickets > 4 | 
       (gamelog$Wickets==4 & gamelog$Over < 18)),'TeamBowling'] <- 'MI'
gamelog[gamelog$MatchNo==201228 & (gamelog$Wickets > 4 | 
       (gamelog$Wickets==4 & gamelog$Over < 18)),'TeamBatting'] <- 'KXP'
gamelog[gamelog$MatchNo==201228 & (gamelog$Wickets > 4 | 
       (gamelog$Wickets==4 & gamelog$Over < 18)),'Inning'] <- 2
gamelog[gamelog$MatchNo==201228 & (gamelog$Wickets > 4 | 
       (gamelog$Wickets==4 & gamelog$Over < 18)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==201228 & (Wickets > 4 | 
       (Wickets==4 & Over < 18)),'Wickets']) - 4

gamelog[gamelog$MatchNo==201258 & (gamelog$Wickets > 1 | 
       (gamelog$Wickets==1 & gamelog$Over ==0)),'TeamBowling'] <- 'MI'
gamelog[gamelog$MatchNo==201258 & (gamelog$Wickets > 1 | 
       (gamelog$Wickets==1 & gamelog$Over ==0)),'TeamBatting'] <- 'KKR'
gamelog[gamelog$MatchNo==201258 & (gamelog$Wickets > 1 | 
       (gamelog$Wickets==1 & gamelog$Over ==0)),'Inning'] <- 2
gamelog[gamelog$MatchNo==201258 & (gamelog$Wickets > 1 | 
       (gamelog$Wickets==1 & gamelog$Over ==0)),'Wickets'] <- 
  with(gamelog,gamelog[MatchNo==201258 & (Wickets > 1 | 
       (Wickets==1 & Over ==0)),'Wickets']) - 1

# Remove duplicate innings
gamelog <- subset(gamelog,!(MatchNo %in% dup_innings & Inning==2)) 

gamelog = subset(gamelog,!NumOutcome==2015) #Nonsense rows

Next we take a look at missing data:

library(mice)
md.pattern(gamelog,rotate.names=TRUE)

##        Format MatchNo TeamBowling TeamBatting Inning Over Ball Bowler Batsman Fielder Outcome NumOutcome BallType
## 12160       1       1           1           1      1    1    1      1       1       1       1          1        1
## 134490      1       1           1           1      1    1    1      1       1       1       1          1        1
## 274         1       1           1           1      1    1    1      1       1       1       1          1        1
## 4696        1       1           1           1      1    1    1      1       1       1       1          1        1
## 224         1       1           1           1      1    1    1      1       1       1       1          1        1
## 3359        1       1           1           1      1    1    1      1       1       1       1          1        1
## 197         1       1           1           1      1    1    1      1       1       1       1          1        1
## 2663        1       1           1           1      1    1    1      1       1       1       1          1        1
##             0       0           0           0      0    0    0      0       0       0       0          0        0
##        NumBallType Notes FullNotes IDflag Wickets ResourcesLeft RunsScored DLSDifferential Sentiment highlight_worthiness
## 12160            1     1         1      1       1             1          1               1         1                    1
## 134490           1     1         1      1       1             1          1               1         1                    1
## 274              1     1         1      1       1             1          1               1         1                    1
## 4696             1     1         1      1       1             1          1               1         1                    1
## 224              1     1         1      1       1             1          1               1         1                    1
## 3359             1     1         1      1       1             1          1               1         1                    1
## 197              1     1         1      1       1             1          1               1         1                    1
## 2663             1     1         1      1       1             1          1               1         1                    1
##                  0     0         0      0       0             0          0               0         0                    0
##        stdized_sentiment ground_gained ground_gained_score BowlerID BatsmanID FielderID       
## 12160                  1             1                   1        1         1         1      0
## 134490                 1             1                   1        1         1         0      1
## 274                    1             1                   1        1         0         1      1
## 4696                   1             1                   1        1         0         0      2
## 224                    1             1                   1        0         1         1      1
## 3359                   1             1                   1        0         1         0      2
## 197                    1             1                   1        0         0         1      2
## 2663                   1             1                   1        0         0         0      3
##                        0             0                   0     6443      7830    145208 159481

All of the missing items fall under the BowlerID, BatsmanID, and FielderID fields. Since we are unlikely to focus on specific bowlers/batsmen/fielders, this does not bother us too much.