Programming

My language of choice for scripting and analysis is Python, especially for end-to-end pipelines, but I have used and am comfortable with several other statistical and scripting languages and programs (R was my bread and butter in graduate school). Two of my favorite, if somewhat tangential, R packages are progress and tikzDevice; a quick sketch of both is below.
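
As a quick, self-contained sketch of why I like them: progress gives a console progress bar for long loops (it also appears in the scripts below), and tikzDevice writes plots as TikZ code that typesets natively in LaTeX. The loop and output file name here are made up for the example.

library(progress)
library(tikzDevice)

# progress: console progress bar for a long-running loop
pb <- progress_bar$new(format = "[:bar] :percent eta: :eta", total = 100)
x <- numeric(100)
for(i in 1:100){
  x[i] <- mean(rnorm(1000))   # stand-in for real work
  pb$tick()
}

# tikzDevice: write the histogram as TikZ so axis labels can use LaTeX math
tikz("exampleHist.tex", width = 3.5, height = 3)   # hypothetical file name
hist(x, main = "", xlab = "$\\bar{x}$")
dev.off()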

Projects

The following projects can be found on my GitHub account.

Code snippets

This R script fits log-normal distributions to reported quantiles for two pass states (open and closed), then samples from the fitted two-state mixture to find the first three moments (and the median).

## This program fits a log-normal distribution to sample data based on outcomes 
## and quantiles. Simulates two-state models through 10,000 samples. Computes
## the mean, median, variance, and skewness.

setwd("/Users/Bradley/Dropbox/...")

# load required packages
library(progress)
library(rriskDistributions)
library(moments)

# load data
sampleData <- read.csv("meanVarData.csv",header=TRUE)
closure <- c(.2556,1-.2556) # closed, open
quantilesNear <- c(0.2,0.5,0.8)
obs <- nrow(sampleData)

# fits 3 quantiles to log-normal distribution
fitLog <- function(info,numObs,quant,tolerance){
  output <- matrix(NA,numObs,2)
  for(i in 1:numObs){
    temp <- get.lnorm.par(p=quant,q=c(info[i,1],info[i,2],info[i,3])
                          ,show.output=F,plot=F,tol=tolerance)
    output[i,1] <- temp[[1]]
    if(is.na(output[i,1])){
      output[i,2] <- NA
    }
    else{
      output[i,2] <- temp[[2]]
    }
  }
  output
}

# returns which observations could not be fit to a log-normal distribution
# (i.e., those for which the fit did not meet the tolerance level)
meetTol <- function(params){
  output <- c()
  for(i in 1:obs){
    if(is.na(params[i])){
      output <- c(output,i)
    }
  }
  output
}

# columns 2-4: open-state quantiles (20th, 50th, 80th); columns 5-7: closed-state quantiles
log1 <- fitLog(sampleData[,2:4],obs,quantilesNear,0.01)
log2 <- fitLog(sampleData[,5:7],obs,quantilesNear,0.01)

meetTol(log1)
meetTol(log2)

# drop the observations that could not be fit, across the whole dataset
drop <- unique(c(meetTol(log1),meetTol(log2)))
keep <- setdiff(1:obs,drop)   # also safe when nothing needs to be dropped
log1Drop <- log1[keep,,drop=FALSE]
log2Drop <- log2[keep,,drop=FALSE]
sampleDataDrop <- sampleData[keep,,drop=FALSE]
obs <- nrow(sampleDataDrop)

# simulates log-normal samples for a two-state model
# and outputs the first 3 moments and the median
sampleDistPass <- function(logDroppedOpen,logDroppedClosed,size){
  output <- matrix(NA,obs,4)
  pb <- progress_bar$new(total = obs)
  for(i in 1:obs){
    hold <- vector(mode="numeric",length=size)
    for(j in 1:size){
      rv <- runif(1,0,1)
      if(rv<closure[2]){
        hold[j] <- rlnorm(1,logDroppedOpen[i,1],logDroppedOpen[i,2])
      }
      else{
        hold[j] <- rlnorm(1,logDroppedClosed[i,1],logDroppedClosed[i,2])
      }
    }
    output[i,1] <- mean(hold)
    output[i,2] <- median(hold)
    output[i,3] <- var(hold)
    output[i,4] <- skewness(hold)
  
    pb$tick()
  }
  output
}

results <- sampleDistPass(log1Drop,log2Drop,10000)
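
As a sanity check on the simulated values, the mean and variance of the two-state mixture are also available in closed form from the fitted parameters (the median of a mixture is not). A minimal sketch, assuming the first and second columns of log1Drop/log2Drop hold the meanlog and sdlog of the open- and closed-state fits, as above:

# closed-form mean and variance of a log-normal with parameters (meanlog, sdlog)
lnormMean <- function(mu,sigma) exp(mu + sigma^2/2)
lnormVar  <- function(mu,sigma) (exp(sigma^2) - 1)*exp(2*mu + sigma^2)

pOpen  <- closure[2]
pClose <- closure[1]
mOpen  <- lnormMean(log1Drop[,1],log1Drop[,2])
mClose <- lnormMean(log2Drop[,1],log2Drop[,2])

# mixture moments: E[X] = pOpen*mOpen + pClose*mClose and
# Var[X] = pOpen*(vOpen + mOpen^2) + pClose*(vClose + mClose^2) - E[X]^2
mixMean <- pOpen*mOpen + pClose*mClose
mixVar  <- pOpen*(lnormVar(log1Drop[,1],log1Drop[,2]) + mOpen^2) +
  pClose*(lnormVar(log2Drop[,1],log2Drop[,2]) + mClose^2) - mixMean^2

# differences should shrink as the number of simulated samples grows
summary(results[,1] - mixMean)
summary(results[,3] - mixVar)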

This Stata .do file merges the original dataset with the US Census TIGER county shapefile to create a contiguity weighting matrix used in spatial analysis.

// This script creates a contiguity weighting matrix by
// merging the original dataset with the US Census TIGER
// shapefile

import delimited "/Users/Bradley/Dropbox/..."
save dataA, replace   // keep a .dta copy of the original data for the formatting and merge steps below

// unzip TIGER file and generate fips pre-merge
copy ~/Downloads/tl_2016_us_county.zip .
unzipfile tl_2016_us_county.zip
spshape2dta tl_2016_us_county
use tl_2016_us_county, clear
describe
list in 1/2
generate long fips = real(STATEFP + COUNTYFP)
bysort fips: assert _N==1
assert fips != .
spset fips, modify replace
list in 1/2
save, replace

// format original data pre-merge
use dataA, clear
format year %tq
assert fips!=.
assert year!=.
bysort fips year: assert _N==1
xtset, clear
xtset fips year
spbalance
spset fips
save "/Users/Bradley/Dropbox/..."

// merge original data with TIGER by fips
use dataA, clear
describe
merge m:1 fips using tl_2016_us_county
keep if _merge==3
drop _merge
drop STATEFP COUNTYFP COUNTYNS GEOID NAME NAMELSAD LSAD CLASSFP
drop MTFCC CSAFP CBSAFP METDIVFP FUNCSTAT ALAND AWATER INTPTLAT INTPTLON
save "/Users/Bradley/Dropbox/..."

// create contiguity matrix
spmatrix create contiguity W if year == 2007
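
For comparison, the same kind of first-order contiguity matrix can be built in R; below is a minimal sketch using the sf and spdep packages on the same TIGER counties shapefile (ignoring the normalization options that spmatrix handles on the Stata side).

library(sf)      # read the shapefile
library(spdep)   # neighbor lists and spatial weights

counties <- st_read("tl_2016_us_county.shp")        # TIGER counties, as above
nb <- poly2nb(counties, queen = TRUE)               # queen (shared border or corner) contiguity
W  <- nb2mat(nb, style = "W", zero.policy = TRUE)   # row-standardized weight matrix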

This R script calls the Google Maps Distance Matrix API to return the driving duration from Snoqualmie Pass to each reported destination over the 3 potential reroute options. Each route is queried under the API's three traffic models (pessimistic, best guess, and optimistic), so the Google Maps API is called over 10,000 times.

## This program finds the duration, in minutes, to all reported I-90 
## destinations from Snoqualmie Pass over three alternatives (2, 12, 84)
## given a departure time via the Google Maps Distance Matrix API

setwd("/Users/Bradley/Dropbox/...")

library(bitops)
library(httr)
library(XML)
library(RCurl)
library(rJava)
library(xlsxjars)
library(xlsx)
library(progress)

# read in data
input <- read.csv("inputRoutes.csv",header=F)
allData <- as.matrix(input)
colnames(allData) <- NULL
numRow <- nrow(allData)
penalty <- 10050   # fixed delay, in seconds, added at Stevens Pass on the route 2 "plus" alternative
snoqPass <- "SNOQUALMIE+PASS+WA"
stevPass <- "STEVENS+PASS+WA"
drive <- c("pessimistic","best_guess","optimistic")
key <- "sign_up_for_a_key"

# routes from Snoqualmie Pass over wavepoints to final destination
f.wavepoint <- function(wavepointA,wavepointB,speed){
  results.A <- matrix(nrow=numRow,ncol=1)
  results.B <- matrix(nrow=numRow,ncol=1)
  results.C <- matrix(nrow=numRow,ncol=1)
  results.out <- matrix(nrow=numRow,ncol=1)
  pb <- progress_bar$new(total = numRow)
  for(i in 1:numRow){
    tempTime <- 0
    
    # routes from SP to first wavepoint
    k <- 0
    while(is.na(results.A[i]) && k<10){
      if(allData[i,1] == 1){
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      snoqPass,'&destinations=',wavepointA,
                      '&units=imperial&departure_time=',allData[i,4],'&traffic_model=',
                      speed,'&key=',key)
      }
      else{
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      snoqPass,'&destinations=',wavepointB,
                      '&units=imperial&departure_time=',allData[i,4],'&traffic_model=',
                      speed,'&key=',key)
      }
      Sys.sleep(0.3)
      tie <- xmlParse(GET(url))
      tryCatch(results.A[i] <- as.numeric(xpathApply(tie,"//duration_in_traffic/value",xmlValue)),
               error=function(e) NULL)
      k <- k+1
    }
    tempTime <- as.numeric(allData[i,4]) + results.A[i]
    
    # routes from first wavepoint to second wavepoint
    k <- 0
    while(is.na(results.B[i]) && k<10){
      if(allData[i,1] == 1){
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointA,'&destinations=',wavepointB,
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      else{
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointB,'&destinations=',wavepointA,
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      Sys.sleep(0.3)
      tie <- xmlParse(GET(url))
      tryCatch(results.B[i] <- as.numeric(xpathApply(tie,"//duration_in_traffic/value",xmlValue)),
               error=function(e) NULL)
      k <- k+1
    }
    tempTime <- tempTime + results.B[i]
    
    # routes from second wavepoint to final destination
    k <- 0
    while(is.na(results.C[i]) && k<10){
      if(allData[i,1] == 1){
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointB,'&destinations=',allData[i,3],
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      else{
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointA,'&destinations=',allData[i,3],
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      Sys.sleep(0.3)
      tie <- xmlParse(GET(url))
      tryCatch(results.C[i] <- as.numeric(xpathApply(tie,"//duration_in_traffic/value",xmlValue)),
               error=function(e) NULL)
      k <- k+1
    }
    
    # output
    results.out[i] <- (results.A[i] + results.B[i] + results.C[i])/60
    pb$tick()
  }
  results.out
}

# routes from Snoqualmie Pass over wavepoints to final destination,
# detouring via Stevens Pass (with a fixed time penalty)
f.wavepointPLUS <- function(wavepointA,wavepointB,speed){
  results.A <- matrix(nrow=numRow,ncol=1)
  results.B <- matrix(nrow=numRow,ncol=1)
  results.C <- matrix(nrow=numRow,ncol=1)
  results.D <- matrix(nrow=numRow,ncol=1)
  results.out <- matrix(nrow=numRow,ncol=1)
  pb <- progress_bar$new(total = numRow)
  for(i in 1:numRow){
    
    # routes from SP to first wavepoint
    k <- 0
    while(is.na(results.A[i]) && k<10){
      if(allData[i,1] == 1){
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      snoqPass,'&destinations=',wavepointA,
                      '&units=imperial&departure_time=',allData[i,4],'&traffic_model=',
                      speed,'&key=',key)
      }
      else{
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      snoqPass,'&destinations=',wavepointB,
                      '&units=imperial&departure_time=',allData[i,4],'&traffic_model=',
                      speed,'&key=',key)
      }
      Sys.sleep(0.3)
      tie <- xmlParse(GET(url))
      tryCatch(results.A[i] <- as.numeric(xpathApply(tie,"//duration_in_traffic/value",xmlValue)),
               error=function(e) NULL)
      k <- k+1
    }
    tempTime <- as.numeric(allData[i,4]) + results.A[i]
    
    # routes from first wavepoint to Stevens Pass
    k <- 0
    while(is.na(results.B[i]) && k<10){
      if(allData[i,1] == 1){
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointA,'&destinations=',stevPass,
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      else{
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointB,'&destinations=',stevPass,
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      Sys.sleep(0.3)
      tie <- xmlParse(GET(url))
      tryCatch(results.B[i] <- as.numeric(xpathApply(tie,"//duration_in_traffic/value",xmlValue)),
               error=function(e) NULL)
      k <- k+1
    }
    tempTime <- tempTime + results.B[i] + penalty
    
    # routes from Stevens Pass to second wavepoint
    k <- 0
    while(is.na(results.C[i]) && k<10){
      if(allData[i,1] == 1){
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      stevPass,'&destinations=',wavepointB,
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      else{
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      stevPass,'&destinations=',wavepointA,
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      Sys.sleep(0.3)
      tie <- xmlParse(GET(url))
      tryCatch(results.C[i] <- as.numeric(xpathApply(tie,"//duration_in_traffic/value",xmlValue)),
               error=function(e) NULL)
      k <- k+1
    }
    tempTime <- tempTime + results.C[i]
    
    # routes from second wavepoint to final destination
    k <- 0
    while(is.na(results.D[i]) && k<10){
      if(allData[i,1] == 1){
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointB,'&destinations=',allData[i,3],
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      else{
        url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                      wavepointA,'&destinations=',allData[i,3],
                      '&units=imperial&departure_time=',tempTime,'&traffic_model=',
                      speed,'&key=',key)
      }
      Sys.sleep(0.3)
      tie <- xmlParse(GET(url))
      tryCatch(results.D[i] <- as.numeric(xpathApply(tie,"//duration_in_traffic/value",xmlValue)),
               error=function(e) NULL)
      k <- k+1
    }
    
    results.out[i] <- (results.A[i]+results.B[i]+penalty+results.C[i]+results.D[i])/60
    pb$tick()
  }
  results.out
}


for(j in 1:3){
  route2 <- f.wavepoint("Leavenworth+WA","Sultan+WA",drive[j])
  route2plus <- f.wavepointPLUS("Leavenworth+WA","Sultan+WA",drive[j])
  route12 <- f.wavepoint("Gleed+WA","Fern+Gap+WA",drive[j])
  route84 <- f.wavepoint("Goldendale+WA","Rooster+Rock+Park+OR",drive[j])
  
  altRoutes <- cbind(allData,route2,route2plus,route12,route84)
  colnames(altRoutes) <- c("west","origin","destination","epochtime",
                           "route2","route2plus","route12","route84")
  write.xlsx(altRoutes,file="altRoutes.xlsx",sheetName=drive[j],
             col.names=T,row.names=F,append=T)
  
  route2 <- NULL
  route2plus <- NULL
  route12 <- NULL
  route84 <- NULL
}
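
A more defensive version of the request/parse step extracts the response body explicitly with httr::content and checks the API's status element before reading a duration, which makes failed requests easier to diagnose than retrying blindly. A minimal sketch of a single request under that pattern (the origin, destination, and departure time are placeholders):

library(httr)
library(XML)

key <- "sign_up_for_a_key"
url <- paste0('https://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
              'SNOQUALMIE+PASS+WA','&destinations=','Leavenworth+WA',
              '&units=imperial&departure_time=','now','&traffic_model=','best_guess',
              '&key=',key)

resp <- GET(url)
doc  <- xmlParse(content(resp, as="text", encoding="UTF-8"))

# the top-level status should be "OK" before any duration is read
status <- xpathSApply(doc, "//DistanceMatrixResponse/status", xmlValue)
if(identical(status,"OK")){
  seconds <- as.numeric(xpathSApply(doc, "//duration_in_traffic/value", xmlValue))
  minutes <- seconds/60
}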