### new Predictive Analytics Tutorial with new structure

parent 6a6abb88
 --- title: 'Tutorial 9: Classification' title: 'Predictive Analytics Case study' output: html_notebook editor_options: chunk_output_type: inline ... ...
 --- title: 'Tutorial 10: Classification' title: 'Predictive Analytics Case study continued' output: html_notebook editor_options: chunk_output_type: inline ... ...
 --- title: 'Tutorial 9: Classification' output: html_notebook editor_options: chunk_output_type: inline --- This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg. ```{r Load libraries} library(FSelector) #for feature selection / you need Java installed to load this package library(party) #for classification algorithm decision trees library(class) #for classification algorithm kNN library(e1071) #for classification algorithm SVM library(randomForest) #further random forest ``` ```{r Load and prepare data} # Load data # Derive and investigate the dependent variable "number of residents" ``` ```{r Detailed analysis of the independent variables} # Descriptive analysis of load traces ------------------------------------- # Plot some load curves from households to get familiar with the data household <- 8 ``` ```{r Feature extraction} # Define and implement 10 features from SMD (e.g. mean consumption, mean # consumption in the evening) calcFeatures.smd <- function(SMD){ #SMD: the load trace for one week (vector with 672 elements) #create a matrix with 7 columns for each day dm15=matrix(as.numeric(SMD),ncol=7) # define some times weekday <- 1:(5*4*24) weekend <- night <- morning <- noon <- afternoon <- evening <- #data.frame for the results D=data.frame(c_week=mean(dm15, na.rm = T)) #calculate consumption features D\$c_night <- mean(dm15[night, 1:7], na.rm = T) D\$c_morning <- mean() D\$c_noon <- mean() D\$c_afternoon <- mean() D\$c_evening <- mean() #calculate statistical features D\$s_we_max <- max() D\$s_we_min <- min() D\$s_wd_max <- max() D\$s_wd_min <- min() #calculate relations D\$r_min_wd_we <- D\$s_wd_min / D\$s_we_min #division by 0 leads to NaN! 
D\$r_min_wd_we <- ifelse(is.na(D\$r_min_wd_we), 0, D\$r_min_wd_we) D\$r_max_wd_we <- D\$r_max_wd_we <- return(D) } #calculate the features for one household calcFeatures.smd(smd[2,]) features <- calcFeatures.smd(smd[1,]) for(i in 2:nrow(smd)){ features <- rbind(features, calcFeatures.smd(smd[i,])) } ``` ```{r Feature selection} # Feature filtering ------------------------------------------------------- # Combine all features in one data frame and apply feature selection methods from the FSelector package. # a) Which features are selected? # b) Can you explain why those features might be selected? #combine all datasets #simple call of the feature selection function #correlation based filter (2 similar ways to call the method) #further feature filter ``` ```{r Classification Basic evaluation approach} ## decison tree #train the model #predict test cases #create confusion matrix and calculate accuracy ## random forest #train the model #predict test cases #create confusion matrix and calculate accuracy ## kNN # predict test cases from training data (lazy learning algorithm has no explicit training step!) #create confusion matrix and calculate accuracy ## SVM #train the model #predict the test cases #create confusion matrix and calculate accuracy ```
 --- title: 'Tutorial 10: Classification' output: html_notebook editor_options: chunk_output_type: inline --- This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg. ```{r Load required packages} library(FSelector) #for feature selection library(e1071) #for classification algorithm SVM library(randomForest) #further random forest library(ROCR) #for illustration of classification performance library(dplyr) #for data wrangling ``` ```{r Load and prepare data} # Load data load("../../data/classification2.RData") # Derive and investigate the dependent variable "number of residents" adults <- as.integer(ifelse(customers\$residents.numAdult=="5 oder mehr", "5",customers\$residents.numAdult)) children <- as.integer(ifelse(customers\$residents.numChildren=="5 oder mehr", "5",customers\$residents.numChildren)) table(ifelse(is.na(children), adults, adults+children)) # think in classes. we have some very rare classes of number of residents (>5) # Prepare pNumResidents to have multiple classes customers\$pNumResidents <- sapply(ifelse(is.na(children), adults, adults+children), function(a) { if(a==0 || is.na(a)){ return(NA) } else if(a==1){ return("1 person") } else if(a==2){ return("2 persons") } else if(a<=5){ return("3-5 persons") } else { return(">5 persons") } }) customers\$pNumResidents <- ordered(customers\$pNumResidents, levels=c("1 person", "2 persons", "3-5 persons", ">5 persons")) table(customers\$pNumResidents) # Prepare pNumResidents to have two (binary) classes customers\$pNumResidents2 <- sapply(ifelse(is.na(children), adults, adults+children), function(a) { if(a==0 || is.na(a)){ return(NA) } else if(a<=2){ return("1-2 persons") } else { return(">2 persons") } }) customers\$pNumResidents2 <- ordered(customers\$pNumResidents2, levels=c("1-2 persons", ">2 persons")) table(customers\$pNumResidents2) all_data <- left_join(all_data, customers %>% select(VID, 
pNumResidents, pNumResidents2, housing.type), by="VID") ``` ```{r Classification with probabilities - binary class} selected.features <- c("c_week","c_morning","c_noon","c_afternoon","c_evening","s_we_max","s_we_min","s_wd_max") set.seed(1506) test.cases <- match( sample(all_data\$VID, size = .3*nrow(all_data)), all_data\$VID) all_data testdata <- na.omit(all_data[test.cases, c("pNumResidents2", selected.features)]) traindata <- na.omit(all_data[-test.cases, c("pNumResidents2", selected.features)]) ## SVM with probabilities ------- model <- svm(pNumResidents2 ~ . , data=traindata, probability = T) clres <- predict(model, newdata=testdata, probability = T) cm <- table(clres, testdata\$pNumResidents2) (accuracy <- (sum(diag(cm))/sum(as.vector(cm)))) #for ROC curve, the probabilities as estimated by the classifier are needed propabilities <- attributes(clres)\$probabilities[,1] # FROM HERE ON EXERCISE ``` ```{r Classification with probabilities - multiclass} testdata <- na.omit(all_data[test.cases, c("pNumResidents", selected.features)]) traindata <- na.omit(all_data[-test.cases, c("pNumResidents", selected.features)]) model <- svm(pNumResidents ~ . 
, data=traindata, probability = T) clres <- predict(model, newdata=testdata, probability = T) cm <- table(clres, testdata\$pNumResidents) (accuracy <- (sum(diag(cm))/sum(as.vector(cm)))) #for ROC curve, the probabilities as estimated by the classifier are needed propabilities <- attributes(clres)\$probabilities # FROM HERE ON EXERCISE ``` ```{r Classification with an advanced evaluation technique: cross-validation} set.seed(1506) folds <- 10 all_data\$crossfolds <- sample(1:folds, nrow(all_data), replace = TRUE) # list for the interim results results <- list() for(foldIndex in 1:folds){ # creating data for the testdata <- na.omit(all_data[all_data\$crossfolds==foldIndex, c("pNumResidents2", selected.features)]) traindata <- na.omit(all_data[all_data\$crossfolds!=foldIndex, c("pNumResidents2", selected.features)]) model <- svm(pNumResidents2 ~ . , data=traindata, probability = T) results[[foldIndex]] <- list() results[[foldIndex]]\$model <- model clres <- predict(model, newdata=testdata, probability = T) results[[foldIndex]]\$probs <- attributes(clres)\$probabilities cm <- table(clres, testdata\$pNumResidents2) results[[foldIndex]]\$cm <- cm results[[foldIndex]]\$accuracy <- (accuracy <- (sum(diag(cm))/sum(as.vector(cm)))) results[[foldIndex]]\$labels <- testdata\$pNumResidents2 } # FROM HERE ON EXERCISE ```
 ... ... @@ -156,7 +156,7 @@ nl_mailsSend %>% nl_mailsSend %>% left_join(nl_links, by="EmailID") %>% left_join(nl_clicks, by="LinkID") %>% left_join(nl_clicks, by=c("LinkID"="LinkID", "URL"="URL")) %>% group_by(EmailID) %>% #this is pretty much the same as above summarise(title = first(NewsletterTitle), ... ...
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment