##########

# the following section of code is from chancesis

# http://www.chancesis.com/2011/08/14/run-expectancy-and-markov-chains/

##########

# change directory path to wherever you have bo_transitions.csv, etc saved

setwd("/Users/Seshoumaru/Desktop/untitled folder/baseball/run-win expectancy")

bo.trans = read.csv("bo_transitions.csv", header = TRUE)

# create header names

start.bo = NULL

end.bo = NULL

bo = NULL

bases = c('___','1__','_2_','12_','__3','1_3','_23','123')

for (b in 0:7) {

  for (o in 0:3) {

    start.bo[b + 8*o + 1] = paste('s.b', bases[b+1], '.o', o, sep = '')

    end.bo[b + 8*o + 1] = paste('e.b', bases[b+1], '.o', o, sep = '')

    bo[b + 8*o + 1] = paste('b', bases[b+1], '.o', o, sep = '')

  }

}

# trans.freq - the frequency of each state transition

# trans.runs - the runs scored on each state transition

trans.freq = array(0, c(32,32), dimnames = list(start.bo, end.bo))

trans.runs = array(0, c(32,32), dimnames = list(start.bo, end.bo))

for (r in 1:nrow(bo.trans)) {

  row.num = bo.trans$START_BASES[r] + 8*bo.trans$START_OUTS[r] + 1

  col.num = bo.trans$END_BASES[r] + 8*bo.trans$END_OUTS[r] + 1

  trans.freq[row.num,col.num] = trans.freq[row.num,col.num] + bo.trans$PA[r]

  trans.runs[row.num,col.num] = trans.runs[row.num,col.num] + bo.trans$EVENT_RUNS[r]

}

# trans.prob - the probability of each state transition

trans.prob = trans.freq

for (r in 1:24) {

  trans.prob[r,] = trans.prob[r,]/sum(trans.prob[r,])

}

for (r in 25:32) {

  trans.prob[r,25] = 1

}

# re & run distribution, simulation method

innings = 100000  # ~4 mins for 100,000, ~1 hour for 1,000,000 on my computer

run.dist.simulation = array(0,c(24,1))

run.exp.simulation = array(0,c(24,1))

for (s in 1:24) {

  for (i in 1:innings) {

    s.state = s

    runs.inning = 0

    while (s.state != 25) {

      e.state = sample(1:ncol(trans.prob), size = 1, prob = trans.prob[s.state,])

      runs.inning = runs.inning + as.numeric(trans.runs[s.state,e.state])

      s.state = e.state

    }

    if (runs.inning+1 > ncol(run.dist.simulation)) {

      for (l in ncol(run.dist.simulation):runs.inning) {

        run.dist.simulation = cbind(run.dist.simulation,array(0,c(24,1)))

      }

    }

    run.dist.simulation[s,runs.inning+1] = run.dist.simulation[s,runs.inning+1] + 1

  }

  run.exp.simulation[s] = weighted.mean(0:(ncol(run.dist.simulation)-1),run.dist.simulation[s,])

}

for (r in 1:24) {

  run.dist.simulation[r,] = run.dist.simulation[r,]/sum(run.dist.simulation[r,])

}

runs = NULL

for (r in 1:ncol(run.dist.simulation)) {

  runs[r] = paste(r-1, 'runs')

}

run.dist.simulation = cbind(run.exp.simulation,run.dist.simulation)

rownames(run.dist.simulation) = bo[1:24]

colnames(run.dist.simulation) = c('re',runs)

##########

# code above is from chancesis

# http://www.chancesis.com/2011/08/14/run-expectancy-and-markov-chains/

##########

# force run distribution chart to go 0-16 runs

 while(ncol(run.dist.simulation)<18) {test.run.dist <- data.frame(cbind(run.dist.simulation,0))}

 

 for(i in 1:nrow(run.dist.simulation)) {run.dist.simulation[i,18] <- sum(run.dist.simulation[i,18:ncol(run.dist.simulation)])}

run.dist.simulation <- run.dist.simulation[,1:18]

 

# user-defined functions

create.run.dist <- function(df1,df2) {

  run.distr <- data.frame()

  for (i in 1:ncol(df1)) {

    run.distr <- rbind(run.distr,df1[1,i]*df2[1,2:18])

  }

  return(run.distr)

}

create.run.dist.2 <- function(df1,df2,h) {

  run.distr <- data.frame()

  for (i in 1:ncol(df1)) {

    run.distr <- rbind(run.distr,df1[1,i]*df2[h,2:18])

  }

  return(run.distr)

}

diag.sum <- function(df) {

        

  runs <- data.frame()

  for (i in 1:ncol(df)) {

    n <- 0

    for (j in 1:i) {

      n <- n+df[j,i+1-j]

    }

    runs[1,i] <- n

  }

  for (i in 1:nrow(df)-1) {

    n <- 0

    for (j in (i+1):nrow(df)) {

      if(ncol(df)+i+1-j>0) {

        n <- n+df[j,ncol(df)+i+1-j]

      }

    }

    runs[1,ncol(df)+i] <- n

  }

  return(runs)

}  

 

  run.dist.2a <- create.run.dist(run.dist.simulation[,2:18],run.dist.simulation)

  run.dist.2 <- diag.sum(run.dist.2a)

   

  run.dist.3a <- create.run.dist(run.dist.2,run.dist.simulation)

  run.dist.3 <- diag.sum(run.dist.3a)

 

  run.dist.4a <- create.run.dist(run.dist.3,run.dist.simulation)

  run.dist.4 <- diag.sum(run.dist.4a)

 

  run.dist.5a <- create.run.dist(run.dist.4,run.dist.simulation)

  run.dist.5 <- diag.sum(run.dist.5a)

  run.dist.6a <- create.run.dist(run.dist.5,run.dist.simulation)

  run.dist.6 <- diag.sum(run.dist.6a)

 

  run.dist.7a <- create.run.dist(run.dist.6,run.dist.simulation)

  run.dist.7 <- diag.sum(run.dist.7a)

 

  run.dist.8a <- create.run.dist(run.dist.7,run.dist.simulation)

  run.dist.8 <- diag.sum(run.dist.8a)

  run.dist.9a <- create.run.dist(run.dist.8,run.dist.simulation)

  run.dist.9 <- diag.sum(run.dist.9a)

 

 

 

#bottom 9

calc.wp.b9 <- function(df) {

  wp.b9 <- data.frame()

  for(h in 1:24) {

    for(i in 1:33) {

      wp <- 0

      diff <- i-17

      for(j in 1:ncol(df)) {

        if(j==18-i) {

          wp <- wp + .5*df[h,j]

        }

        if(j>18-i) {

          wp <- wp + df[h,j]

        }

      }

      wp.b9[h,i] <- wp

    }

  }

  colnames(wp.b9) <- -16:16

  return(wp.b9)

}

#top 9

calc.wp.t9 <- function(df) {

  wp.t9 <- data.frame()

  for(h in 1:24) {

    for(i in 1:33) {

      wp <- 0

      diff <- i-17

      for(j in 1:ncol(df)) {

        ra <- j-1

        for(k in 1:ncol(df)) {

          rs <- k-1

          if((rs+diff)==ra) {

            wp <- wp + .5*df[h,j]*df[1,k]

          }

          if((rs+diff)>ra) {

            wp <- wp + df[h,j]*df[1,k]

          }

        }

      }

      wp.t9[h,i] <- wp

    }

  }

  colnames(wp.t9) <- -16:16

  return(wp.t9)

}

# bottom innings 1-8

calc.wp.bottom <- function(df1,df2,df3) {

  wp.bottom <- data.frame()

  for(h in 1:24) {

    run.dist.a <- create.run.dist.2(df1,df2,h)

    run.dist <- diag.sum(run.dist.a)

    for(i in 1:33) {

      wp <- 0

      diff <- i-17

      for(j in 1:ncol(run.dist)) {

        rs <- j-1

        for(k in 1:ncol(df3)) {

          ra <- k-1

          if((rs+diff)==ra) {

            wp <- wp + .5*run.dist[1,j]*df3[1,k]

          }

          if((rs+diff)>ra) {

            wp <- wp + run.dist[1,j]*df3[1,k]

          }

        }

      }

      wp.bottom[h,i] <- wp

    }

  }

  colnames(wp.bottom) <- -16:16

  return(wp.bottom)

}

#top innings 1-8

calc.wp.top <- function(df1,df2,df3) {

  wp.top <- data.frame()

  for(h in 1:24) {

    run.dist.a <- create.run.dist.2(df1,df2,h)

    run.dist <- diag.sum(run.dist.a)

    for(i in 1:33) {

      wp <- 0

      diff <- i-17

      for(j in 1:ncol(run.dist)) {

        ra <- j-1

        for(k in 1:ncol(df3)) {

          rs <- k-1

          if((rs+diff)==ra) {

            wp <- wp + .5*run.dist[1,j]*df3[1,k]

          }

          if((rs+diff)>ra) {

            wp <- wp + run.dist[1,j]*df3[1,k]

          }

        }

      }

      wp.top[h,i] <- wp

    }

  }

  colnames(wp.top) <- -16:16

  return(wp.top)

}

# calculate win expectency tables by inning

wp.b9 <- calc.wp.b9(run.dist.simulation[,2:18])

wp.b9 <- cbind(inn=9,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b9)

wp.t9 <- calc.wp.t9(run.dist.simulation[,2:18])

wp.t9 <- cbind(inn=9,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t9)

wp.b8 <- calc.wp.bottom(run.dist.simulation[,2:18],run.dist.simulation,run.dist.simulation[,2:18])

wp.b8 <- cbind(inn=8,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b8)

wp.t8 <- calc.wp.top(run.dist.simulation[,2:18],run.dist.simulation,run.dist.2[1:31])

wp.t8 <- cbind(inn=8,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t8)

wp.b7 <- calc.wp.bottom(run.dist.2[1:31],run.dist.simulation,run.dist.2[1:31])

wp.b7 <- cbind(inn=7,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b7)

wp.t7 <- calc.wp.top(run.dist.2[1:31],run.dist.simulation,run.dist.3[1:31])

wp.t7 <- cbind(inn=7,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t7)

wp.b6 <- calc.wp.bottom(run.dist.3[1:31],run.dist.simulation,run.dist.3[1:31])

wp.b6 <- cbind(inn=6,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b6)

wp.t6 <- calc.wp.top(run.dist.3[1:31],run.dist.simulation,run.dist.4[1:31])

wp.t6 <- cbind(inn=6,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t6)

wp.b5 <- calc.wp.bottom(run.dist.4[1:31],run.dist.simulation,run.dist.4[1:31])

wp.b5 <- cbind(inn=5,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b5)

wp.t5 <- calc.wp.top(run.dist.4[1:31],run.dist.simulation,run.dist.5[1:31])

wp.t5 <- cbind(inn=5,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t5)

wp.b4 <- calc.wp.bottom(run.dist.5[1:31],run.dist.simulation,run.dist.5[1:31])

wp.b4 <- cbind(inn=4,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b4)

wp.t4 <- calc.wp.top(run.dist.5[1:31],run.dist.simulation,run.dist.6[1:31])

wp.t4 <- cbind(inn=4,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t4)

wp.b3 <- calc.wp.bottom(run.dist.6[1:31],run.dist.simulation,run.dist.6[1:31])

wp.b3 <- cbind(inn=3,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b3)

wp.t3 <- calc.wp.top(run.dist.6[1:31],run.dist.simulation,run.dist.7[1:31])

wp.t3 <- cbind(inn=3,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t3)

wp.b2 <- calc.wp.bottom(run.dist.7[1:31],run.dist.simulation,run.dist.7[1:31])

wp.b2 <- cbind(inn=2,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b2)

wp.t2 <- calc.wp.top(run.dist.7[1:31],run.dist.simulation,run.dist.8[1:31])

wp.t2 <- cbind(inn=2,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t2)

wp.b1 <- calc.wp.bottom(run.dist.8[1:31],run.dist.simulation,run.dist.8[1:31])

wp.b1 <- cbind(inn=1,half="b",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.b1)

wp.t1 <- calc.wp.top(run.dist.8[1:31],run.dist.simulation,run.dist.9[1:31])

wp.t1[,18:33] <- -1

wp.t1 <- cbind(inn=1,half="t",rp9=9*run.dist.simulation[1,1],bo=rownames(run.dist.simulation),wp.t1)

win.expectancy <- data.frame()

win.expectancy <- rbind(wp.t1,wp.b1,wp.t2,wp.b2,wp.t3,wp.b3,wp.t4,wp.b4,wp.t5,wp.b5,wp.t6,wp.b6,wp.t7,wp.b7,wp.t8,wp.b8,wp.t9,wp.b9)

rownames(win.expectancy) <- paste(win.expectancy[,2],win.expectancy[,1],win.expectancy[,4], sep='')

we.rounded <- data.frame(lapply(win.expectancy, function(y) if(is.numeric(y)) round(y, 4) else y))

colnames(we.rounded) <- colnames(win.expectancy)

rownames(we.rounded) <- rownames(win.expectancy)

##################

# Leverage Index #

##################

gs.freq = read.csv("game_state_frequency.csv", header = TRUE)

return.greater.value <- function(a,b) {

  c <- 0

  for(i in 1:length(a)) {

    if(a[i]>b) { c[i] <- a[i] } else { c[i] <- b }

  }

  return(c)

}

return.lesser.value <- function(a,b) {

  c <- 0

  for(i in 1:length(a)) {

    if(a[i] < b) { c[i] <- a[i] } else { c[i] <- b }

  }

  return(c)

}

sum.cols <- function(df) {

  PA <- df$PA

  diff <- abs(as.numeric(df$start.val)-as.numeric(df$end.val))

  weight <- PA*diff

  LI <- data.frame(PA=sum(PA),weight=sum(weight),LI=sum(weight)/sum(PA))

  return(LI)

}

sum.cols.2 <- function(df) {

  PA <- df$PA.y

  weight <- PA*df$LI

  scale <- sum(weight)/sum(PA)

  return(scale)

}

li.top.inning <- function(df,df2,inn) {

  LI <- data.frame()

  end.inn.val <- df2[df2$bo=="b___.o0",]

  wp <- data.frame(cbind(df,b=0:7,o=floor(0:23/8)))

  for(i in 5:37) {

    trans <- merge(bo.trans,data.frame(cbind(start.val=df[,i],b=0:7,o=floor(0:23/8))),by.x=c("START_BASES","START_OUTS"),by.y=c("b","o"))

    event.r <- trans$EVENT_RUNS

    end.b <- trans$END_BASES

    end.o <- trans$END_OUTS

    end.diff <- return.greater.value(i-event.r,5)

    trans$diff<-end.diff

    end.v <- 0

    for(j in 1:nrow(trans)) {

      if(end.o[j]==3) {

        end.value <- end.inn.val[end.diff[j]]

      } else {

        end.value <- wp[wp$b==end.b[j] & wp$o==end.o[j],end.diff[j]]

      }

      end.v[j] <- end.value

    }

  trans$end.val <- end.v

  library(plyr)

  LI.a <- ddply(trans,c("START_BASES","START_OUTS"),sum.cols)

  LI.a$inn <- inn

  LI.a$half <- "t"

  LI.a$diff <- i-21

  LI <- rbind(LI,LI.a)

  }

  return(LI)

}

li.bottom.inning <- function(df,df2,inn) {

  LI <- data.frame()

  end.inn.val <- df2[df2$bo=="b___.o0",]

  wp <- data.frame(cbind(df,b=0:7,o=floor(0:23/8)))

  for(i in 5:37) {

    trans <- merge(bo.trans,data.frame(cbind(start.val=df[,i],b=0:7,o=floor(0:23/8))),by.x=c("START_BASES","START_OUTS"),by.y=c("b","o"))

    event.r <- trans$EVENT_RUNS

    end.b <- trans$END_BASES

    end.o <- trans$END_OUTS

    end.diff <- return.lesser.value(i+event.r,37)

    trans$diff<-end.diff

    end.v <- 0

    for(j in 1:nrow(trans)) {

      if(end.o[j]==3) {

        end.value <- end.inn.val[end.diff[j]]

      } else {

        end.value <- wp[wp$b==end.b[j] & wp$o==end.o[j],end.diff[j]]

      }

      end.v[j] <- end.value

    }

  trans$end.val <- end.v

  library(plyr)

  LI.a <- ddply(trans,c("START_BASES","START_OUTS"),sum.cols)

  LI.a$inn <- inn

  LI.a$half <- "b"

  LI.a$diff <- i-21

  LI <- rbind(LI,LI.a)

  }

  return(LI)

}

li.bottom.9 <- function(df) {

  LI <- data.frame()

#  end.inn.val <- df2[df2$bo=="b___.o0",]

  wp <- data.frame(cbind(df,b=0:7,o=floor(0:23/8)))

  for(i in 5:37) {

    trans <- merge(bo.trans,data.frame(cbind(start.val=df[,i],b=0:7,o=floor(0:23/8))),by.x=c("START_BASES","START_OUTS"),by.y=c("b","o"))

    event.r <- trans$EVENT_RUNS

    end.b <- trans$END_BASES

    end.o <- trans$END_OUTS

    end.diff <- return.lesser.value(i+event.r,37)

    trans$diff<-end.diff

    end.v <- 0

    for(j in 1:nrow(trans)) {

      if(end.o[j]==3) {

        if(end.diff[j]<21) {

          end.value=0

        } else if(end.diff[j]>21) {

          end.value=1

        } else if(end.diff[j]==21) {

          end.value=.5

        }

      } else {

        end.value <- wp[wp$b==end.b[j] & wp$o==end.o[j],end.diff[j]]

      }

      end.v[j] <- end.value

    }

  trans$end.val <- end.v

  library(plyr)

  LI.a <- ddply(trans,c("START_BASES","START_OUTS"),sum.cols)

  LI.a$inn <- 9

  LI.a$half <- "b"

  LI.a$diff <- i-21

  LI <- rbind(LI,LI.a)

  }

  return(LI)

}

li.t1 <- li.top.inning(wp.t1,wp.b1,1)

li.t2 <- li.top.inning(wp.t2,wp.b2,2)

li.t3 <- li.top.inning(wp.t3,wp.b3,3)

li.t4 <- li.top.inning(wp.t4,wp.b4,4)

li.t5 <- li.top.inning(wp.t5,wp.b5,5)

li.t6 <- li.top.inning(wp.t6,wp.b6,6)

li.t7 <- li.top.inning(wp.t7,wp.b7,7)

li.t8 <- li.top.inning(wp.t8,wp.b8,8)

li.t9 <- li.top.inning(wp.t9,wp.b9,9)

li.b1 <- li.bottom.inning(wp.b1,wp.t2,1)

li.b2 <- li.bottom.inning(wp.b2,wp.t3,2)

li.b3 <- li.bottom.inning(wp.b3,wp.t4,3)

li.b4 <- li.bottom.inning(wp.b4,wp.t5,4)

li.b5 <- li.bottom.inning(wp.b5,wp.t6,5)

li.b6 <- li.bottom.inning(wp.b6,wp.t7,6)

li.b7 <- li.bottom.inning(wp.b7,wp.t8,7)

li.b8 <- li.bottom.inning(wp.b8,wp.t9,8)

li.b9 <- li.bottom.9(wp.b9)

leverage.index.a <- rbind(li.t1,li.b1,li.t2,li.b2,li.t3,li.b3,li.t4,li.b4,li.t5,li.b5,li.t6,li.b6,li.t7,li.b7,li.t8,li.b8,li.t9,li.b9)

LI.scale <- merge(leverage.index.a,gs.freq,by=c("START_BASES","START_OUTS","inn","half","diff"))

calc.LI <- function(df,LI.scale) {

  LI <- data.frame()

  scale <- sum.cols.2(LI.scale)

  for (inn in 1:9) {

    for (b in 0:7) {

      for (o in 0:2) {

        LI.a <- c(inn,0,b,o,as.numeric(df[df$START_BASES==b & df$START_OUTS==o & df$inn==inn & df$half=='t',5]/scale))

        LI <- rbind(LI,LI.a)

        LI.b <-c(inn,1,b,o,as.numeric(df[df$START_BASES==b & df$START_OUTS==o & df$inn==inn & df$half=='b',5]/scale))

        LI <- rbind(LI,LI.b)

      }

    }

  }

  colnames(LI) <- c("inn","half","b","o",-16:16)

  return(LI)

}

LI <- calc.LI(leverage.index.a,LI.scale)

write.csv(win.expectancy, file="win expectancy.csv")

write.csv(we.rounded, file="win expectancy rounded.csv")

write.csv(LI, file="leverage index.csv")