My language of choice for supervised and unsupervised learning is Python (and R); however, I am able to run models in Spark (using Scala) for larger projects.
The following scripts can be found on my GitHub account.
This Python script develops multiple bag-of-words models to classify IMDb movie reviews as positive or negative using Naive Bayes and support vector machines.
## This program develops multiple bag-of-words models to classify IMDb movie
## reviews as positive or negative. A sample set of 50k reviews is used to
## train and validate the models (NB and SVM) using both word occurrence and
## TF-IDF algorithms.
# import library
import os
# set working directory
# NOTE(review): placeholder path — point this at the directory containing
# the 'negative' and 'positive' review sub-folders before running
os.chdir('/Users/Bradley/Dropbox/...')
# remember the project root; f_ReadIn() builds sub-folder paths from it
cwd = os.getcwd()
##########
## Read in the data and split into training/test
##########
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
# function --> reads in individual text files in a folder and attaches sentiment
def f_ReadIn(folder, sentiment):
    """Read every .txt review in `folder` and attach a sentiment label.

    Parameters
    ----------
    folder : str
        Sub-directory of the module-level `cwd` holding one review per
        .txt file.
    sentiment : int
        Label attached to every review in the folder (0 = negative,
        1 = positive in this script).

    Returns
    -------
    pandas.DataFrame
        One row per review: [review text, sentiment label]. Columns are
        unnamed here; the caller renames them.

    Side effect: changes the process working directory (the caller
    resets it afterwards with os.chdir(cwd)).
    """
    result = []
    # chdir kept to preserve the original script's behavior; the caller
    # relies on resetting the working directory later
    os.chdir(os.path.join(cwd, folder))
    f_cwd = os.getcwd()
    for f in os.listdir(f_cwd):
        # skip sub-directories and any non-text files
        if os.path.isfile(f) and f.endswith(".txt"):
            with open(os.path.join(f_cwd, f), 'r') as file:
                content = file.read()
            result.append([content, sentiment])
    return pd.DataFrame(result)
# Read both review folders and stack them into a single labeled DataFrame
# (negative reviews first, then positive, matching the original order).
df = pd.concat(
    [f_ReadIn('negative', 0), f_ReadIn('positive', 1)],
    ignore_index=True,
)
df.columns = ['review', 'sentiment']
df.head()
# restore the working directory that f_ReadIn() changed
os.chdir(cwd)
# hold out 30% of the reviews for model validation
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.3
)
##########
## Create vector of features using the count and TF-IDF algorithms
##########
# import libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Feature set 1: raw term counts over the 1,000 most frequent
# non-stopword terms; the vocabulary is learned from the training
# split only, then applied unchanged to the test split.
count_vec = CountVectorizer(stop_words='english', max_features=1000)
c_train = count_vec.fit_transform(X_train)
c_test = count_vec.transform(X_test)

# Feature set 2: same vocabulary cap, TF-IDF-weighted term scores.
tfidf_vec = TfidfVectorizer(stop_words='english', max_features=1000)
tf_train = tfidf_vec.fit_transform(X_train)
tf_test = tfidf_vec.transform(X_test)
##########
## Build and tune models (Naive Bayes and SVM)
##########
# import libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# function --> tune NB
def f_tune_nb(X, y, alphas=(0.1, 0.5, 1)):
    """Grid-search the Naive Bayes smoothing parameter with 5-fold CV.

    Parameters
    ----------
    X : sparse matrix, shape (n_samples, n_features)
        Training feature matrix (count or TF-IDF).
    y : array-like
        Class labels.
    alphas : iterable of float, optional
        Candidate additive-smoothing values; the default reproduces the
        original search grid.

    Returns
    -------
    dict
        Best parameter setting, e.g. ``{'alpha': 0.5}``.
    """
    param_grid = {'alpha': list(alphas)}
    grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5)
    grid_search.fit(X, y)
    return grid_search.best_params_
# function --> tune SVM (not feasible on a local computer)
def f_tune_svm(X, y, costs=(0.1, 1, 10), gammas=(0.1, 1, 4)):
    """Grid-search SVM cost and RBF-kernel width with 5-fold CV.

    Noted in the original: this search is expensive and may not be
    feasible on a local machine.

    Parameters
    ----------
    X : sparse matrix, shape (n_samples, n_features)
        Training feature matrix (count or TF-IDF).
    y : array-like
        Class labels.
    costs : iterable of float, optional
        Candidate values of the penalty parameter C (default matches
        the original grid).
    gammas : iterable of float, optional
        Candidate RBF gamma values (default matches the original grid).

    Returns
    -------
    dict
        Best parameter setting, e.g. ``{'C': 1, 'gamma': 0.1}``.
    """
    param_grid = {'C': list(costs), 'gamma': list(gammas)}
    grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5)
    grid_search.fit(X, y)
    return grid_search.best_params_
# report the best hyper-parameters for each feature/model combination
print('')
for tag, tuner, features in [
    ('c_nb', f_tune_nb, c_train),
    ('tf_nb', f_tune_nb, tf_train),
    ('c_svm', f_tune_svm, c_train),
    ('tf_svm', f_tune_svm, tf_train),
]:
    print('%s --> %a' % (tag, tuner(features, y_train)))
##########
## Run tuned models (Naive Bayes and SVM) and compare results
##########
# Refit each model with its tuned hyper-parameters, score it on the
# held-out test split, and report accuracy. Hyper-parameter values come
# from the grid search above (NB alpha per feature set; SVM C/gamma).
model_runs = [
    ('Using Count Vectorizer and Naive Bayes',
     MultinomialNB(alpha=0.1), c_train, c_test),
    ('Using TF-IDF Vectorizer and Naive Bayes',
     MultinomialNB(alpha=0.5), tf_train, tf_test),
    ('Using Count Vectorizer and SVM',
     svm.SVC(C=1, gamma=0.1), c_train, c_test),
    ('Using TF-IDF Vectorizer and SVM',
     svm.SVC(C=1, gamma=0.1), tf_train, tf_test),
]
for title, model, train_feats, test_feats in model_runs:
    model.fit(train_feats, y_train)
    predictions = model.predict(test_feats)
    print('')
    print(title)
    print('Accuracy: %0.4f' % (accuracy_score(y_test, predictions)))
This R script predicts the overall score assigned to each player in the FIFA 2019 database using random forest regression and gradient boosting (gbm).
## This program predicts the overall player score of soccer players in
## the FIFA 2019 database using random forests and boosting.
# NOTE(review): placeholder path — set to the folder containing fifa_data.csv
setwd("/Users/Bradley/Dropbox/...")
## load required libraries
library(randomForest)
library(gbm)
# load and clean data
f_dataAll <- read.csv("fifa_data.csv",header = T)
str(f_dataAll)
# keep Overall (the response, column 8) plus selected predictor columns
# NOTE(review): column positions are hard-coded against the FIFA 2019 CSV
# layout — re-verify if the file version changes
f_data <- f_dataAll[,c(8,4,15,27,28,55:(ncol(f_dataAll)-1))]
f_data <- na.omit(f_data)
# Weight arrives as text like "150lbs": drop the 3-character unit suffix
f_data$Weight <- as.character(f_data$Weight)
f_data$Weight <- as.numeric(substr(f_data$Weight,1,nchar(f_data$Weight)-3))
# Height arrives as feet'inches (e.g. "5'10"): convert to total inches
f_data$Height <- as.character(f_data$Height)
f_data$Height <- (as.numeric(substr(f_data$Height,1,1)) * 12) +
as.numeric(substr(f_data$Height,3,5))
# drop factor levels removed by na.omit()
f_data$Preferred.Foot <- droplevels(f_data$Preferred.Foot)
str(f_data)
rows <- nrow(f_data)
# split data --> training and test (80/20; no seed set, so not reproducible)
split <- sample.int(n=rows,size=floor(0.8*rows),replace=F)
training <- f_data[split, ]
test <- f_data[-split, ]
# plot subset of data to see general correlations
# (column 1 is Overall; each pairs() panel keeps it for comparison)
look1 <- test[,1:10]
look2 <- test[,c(1,11:20)]
look3 <- test[,c(1,21:ncol(test))]
pairs(look1)
pairs(look2)
pairs(look3)
############
## Random forests
############
# determine number of trees
# (bare object names at top level auto-print the fitted-model summary)
rf <- randomForest(Overall~.,data=training)
rf
plot(rf)
# tree count at which the out-of-bag MSE bottoms out
which.min(rf$mse)
# choose mtry: # of vars to consider each split
# (fit one forest per candidate mtry and keep the final OOB error)
oob.err=double(20)
for(i in 1:20){
fit=randomForest(Overall~.,data=training,mtry=i,ntree=400)
oob.err[i]=fit$mse[400]
cat(i," ")
}
# plot mtry data (i is 20 here — R loop variables persist after the loop)
plot(1:i,oob.err,pch=19,col="red",type="b",ylab="Mean Squared Error",
xlab="mtry values 1 to 20",main="Out-of-bag Error")
which.min(oob.err)
# validate on test data with tuned parameters
rf_test <- randomForest(Overall~.,data=training,xtest=test[,-1],
ytest=test$Overall,mtry=7,ntree=500,importance=T)
rf_test
# view variable importance
varImpPlot(rf_test)
############
## Boosting
############
# grid of candidate shrinkage / tree-depth / min-node-size settings
hyper_grid <- expand.grid(shrinkage=c(.001,.01,.1),interaction.depth=c(1,3,5),
                          n.minobsinnode=c(10,20))
# grid search: fit each setting and record its best validation error
for(i in 1:nrow(hyper_grid)){
  # train model (gbm holds out 25% of `training` internally for validation)
  boost.tune <- gbm(Overall~.,data=training,distribution="gaussian",n.trees=10000,
                    interaction.depth=hyper_grid$interaction.depth[i],
                    shrinkage=hyper_grid$shrinkage[i],
                    n.minobsinnode=hyper_grid$n.minobsinnode[i],train.fraction=.75)
  # add optimal tree count and min validation RMSE to the grid
  hyper_grid$optimal_trees[i] <- which.min(boost.tune$valid.error)
  hyper_grid$min_RMSE[i] <- sqrt(min(boost.tune$valid.error))
  cat(i," ")
}
# view top models (lowest validation RMSE first)
hyper_grid[order(hyper_grid$min_RMSE),]
# build best model and validate on test data
boost <- gbm(Overall~.,data=training,distribution="gaussian",n.trees=4600,
             shrinkage=0.1,interaction.depth=5)
summary(boost)
# plot RMSE results for increasing numbers of trees
num.trees=seq(from=100,to=4600,by=100)
# test.pred is an n_obs x length(num.trees) matrix of predictions
test.pred <- predict(boost,newdata=test,n.trees=num.trees)
# BUG FIX: the original took sqrt() element-wise BEFORE averaging, which
# computes mean(|error|) (MAE), not RMSE; RMSE is sqrt(mean(error^2))
test.err <- with(test,sqrt(apply((test.pred-Overall)^2,2,mean)))
plot(num.trees,test.err,pch=19,col="red",type="b",ylab="Root Mean Squared Error",
     xlab="# Trees",main="Boosting Test Error")
# compare best model to test data
which.min(test.err)
min(test.err)
# best model RMSE for test data
test.err[length(num.trees)]
This Spark/Scala script predicts the presence of West Nile virus among mosquito populations in Chicago via logistic regression (classification). The model is tuned and validated by k-fold cross-validation.
/**
* Logistic regression with k-fold cross-validation using Spark (Scala)
*/
import org.apache.spark.sql.SparkSession
// see less warnings
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)
// start Session
val spark = SparkSession.builder().getOrCreate()
// BUG FIX: the $"column" interpolator and Dataset .as[...] conversions
// used below require the session's implicits in scope
import spark.implicits._
// load West Nile data (first row is the header; infer column types)
val wn_data = (spark.read.option("header","true")
  .option("inferSchema","true")
  .csv("westnile_data.csv"))
// inspect the schema and the first row
wn_data.printSchema()
wn_data.head(1)
// create categorical variables
import org.apache.spark.ml.feature.{StringIndexer,OneHotEncoderEstimator}
// index the response (RESULT) and expose it under the required name "label"
val transform_data = (new StringIndexer()
  .setInputCol("RESULT")
  .setOutputCol("resultIndex")
  .fit(wn_data)
  .transform(wn_data))
// NOTE(review): WEEK is selected here but never added to the feature
// vector below — confirm whether it was meant to be a predictor
val ready_dataAll = transform_data.select(transform_data("resultIndex").as("label"),
  $"TRAP_TYPE",$"SPECIES",$"WEEK",$"NUMBER OF MOSQUITOES")
val ready_data = ready_dataAll.na.drop()
// string -> numeric index for each categorical predictor
val trapIndexer = (new StringIndexer().setInputCol("TRAP_TYPE")
  .setOutputCol("traptypeIndex"))
val speciesIndexer = (new StringIndexer().setInputCol("SPECIES")
  .setOutputCol("speciesIndex"))
// one-hot encode the indices so categories carry no ordinal meaning
val encoder = (new OneHotEncoderEstimator()
  .setInputCols(Array("traptypeIndex","speciesIndex"))
  .setOutputCols(Array("traptypeVec","speciesVec")))
// joins multiple feature columns into a single column of an array of feature values
// (label,features)
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// dependent var must be titled as "label"; the independent vars as "features"
// BUG FIX: assemble the one-hot vectors (traptypeVec/speciesVec) instead of
// the raw indices — the encoder's output was previously never used, and raw
// indices impose a false ordering on unordered categories
val assemble = (new VectorAssembler()
  .setInputCols(Array("traptypeVec","speciesVec","NUMBER OF MOSQUITOES"))
  .setOutputCol("features"))
// split data 75/25 into training and test sets
val Array(training, test) = ready_data.randomSplit(Array(0.75, 0.25))
// run k-fold cv for logistic regression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.Pipeline
// base estimator: logistic regression capped at 10 iterations
val logReg = new LogisticRegression().setMaxIter(10)
// candidate regularization strengths to search over
val regGrid = (new ParamGridBuilder()
  .addGrid(logReg.regParam, Array(0.1, 0.01))
  .build())
// CrossValidator needs an Estimator, its ParamMaps, and an Evaluator;
// here 5-fold CV scored by the BinaryClassificationEvaluator default metric
val crossVal = (new CrossValidator()
  .setEstimator(logReg)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(regGrid)
  .setNumFolds(5))
// full preprocessing + tuning pipeline: index, encode, assemble, cross-validate
val fullPipeline = new Pipeline().setStages(Array(trapIndexer, speciesIndexer,
  encoder, assemble, crossVal))
// fit the pipeline; the CV stage keeps the best parameter setting
val cvModel = fullPipeline.fit(training)
// evaluation: MulticlassMetrics wants an RDD of (prediction, label)
// pairs, so convert the prediction DataFrame down from Dataset to RDD
import org.apache.spark.mllib.evaluation.MulticlassMetrics
val predLabelPairs = (cvModel.transform(test)
  .select($"prediction", $"label")
  .as[(Double, Double)]
  .rdd)
val metrics = new MulticlassMetrics(predLabelPairs)
// confusion matrix
println("Confusion matrix:")
println(metrics.confusionMatrix)