Commit e24fd8f3 authored by Weigert, Andreas's avatar Weigert, Andreas
Browse files

new Predictive Analytics Tutorial with new structure

parent 6a6abb88
---
title: 'Tutorial 9: Classification'
title: 'Predictive Analytics Case study'
output: html_notebook
editor_options:
chunk_output_type: inline
......
---
title: 'Tutorial 10: Classification'
title: 'Predictive Analytics Case study continued'
output: html_notebook
editor_options:
chunk_output_type: inline
......
---
title: 'Tutorial 9: Classification'
output: html_notebook
editor_options:
chunk_output_type: inline
---
This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.
```{r Load libraries}
library(FSelector) #for feature selection / you need Java installed to load this package
library(party) #for classification algorithm decision trees
library(class) #for classification algorithm kNN
library(e1071) #for classification algorithm SVM
library(randomForest) #further random forest
```
```{r Load and prepare data}
# Load data
# Derive and investigate the dependent variable "number of residents"
```
```{r Detailed analysis of the independent variables}
# Descriptive analysis of load traces -------------------------------------
# Plot some load curves from households to get familiar with the data
# Index of the example household to inspect -- presumably a row of the
# smart-meter data matrix loaded above; confirm against the plotting code
household <- 8
```
```{r Feature extraction}
# Define and implement 10 features from SMD (e.g. mean consumption, mean
# consumption in the evening)
# Extract consumption, statistical and ratio features from one week of
# smart-meter data (SMD).
# NOTE(review): this is a tutorial template -- several right-hand sides are
# intentionally left blank for students to fill in. As written, the blank
# assignments chain into the following statements and the empty
# mean()/max()/min() calls error at runtime until completed.
calcFeatures.smd <- function(SMD){
#SMD: the load trace for one week (vector with 672 elements)
#create a matrix with 7 columns for each day
# 672 readings / 7 days = 96 quarter-hour values per column (one column per day)
dm15=matrix(as.numeric(SMD),ncol=7)
# define some times
# row index ranges into dm15; weekday: 5 days * 4 readings/hour * 24 hours
weekday <- 1:(5*4*24)
weekend <-
night <-
morning <-
noon <-
afternoon <-
evening <-
#data.frame for the results
# c_week: mean consumption over the whole week (missing readings ignored)
D=data.frame(c_week=mean(dm15, na.rm = T))
#calculate consumption features
D$c_night <- mean(dm15[night, 1:7], na.rm = T)
D$c_morning <- mean()
D$c_noon <- mean()
D$c_afternoon <- mean()
D$c_evening <- mean()
#calculate statistical features
D$s_we_max <- max()
D$s_we_min <- min()
D$s_wd_max <- max()
D$s_wd_min <- min()
#calculate relations
D$r_min_wd_we <- D$s_wd_min / D$s_we_min #division by 0 leads to NaN!
#replace NaN (0/0) by 0 so downstream models receive a plain numeric value
D$r_min_wd_we <- ifelse(is.na(D$r_min_wd_we), 0, D$r_min_wd_we)
D$r_max_wd_we <-
D$r_max_wd_we <-
#one-row data.frame with all features for this household-week
return(D)
}
#calculate the features for one household (smoke test of the feature function)
calcFeatures.smd(smd[2,])

# Calculate the features for every household.
# do.call(rbind, lapply(...)) builds the result in one pass instead of the
# original rbind-in-a-loop, which copies the growing data frame on every
# iteration (O(n^2)) and whose `2:nrow(smd)` index would count backwards
# (c(2, 1)) if smd ever had fewer than two rows.
features <- do.call(
  rbind,
  lapply(seq_len(nrow(smd)), function(i) calcFeatures.smd(smd[i, ]))
)
```
```{r Feature selection}
# Feature filtering -------------------------------------------------------
# Combine all features in one data frame and apply feature selection methods from the FSelector package.
# a) Which features are selected?
# b) Can you explain why those features might be selected?
#combine all datasets
#simple call of the feature selection function
#correlation based filter (2 similar ways to call the method)
#further feature filter
```
```{r Classification Basic evaluation approach}
## decision tree
#train the model
#predict test cases
#create confusion matrix and calculate accuracy
## random forest
#train the model
#predict test cases
#create confusion matrix and calculate accuracy
## kNN
# predict test cases from training data (lazy learning algorithm has no explicit training step!)
#create confusion matrix and calculate accuracy
## SVM
#train the model
#predict the test cases
#create confusion matrix and calculate accuracy
```
---
title: 'Tutorial 10: Classification'
output: html_notebook
editor_options:
chunk_output_type: inline
---
This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.
```{r Load required packages}
library(FSelector) #for feature selection
library(e1071) #for classification algorithm SVM
library(randomForest) #further random forest
library(ROCR) #for illustration of classification performance
library(dplyr) #for data wrangling
```
```{r Load and prepare data}
# Load data
load("../../data/classification2.RData")

# Derive and investigate the dependent variable "number of residents".
# The survey encodes counts of five or more as the literal string
# "5 oder mehr" (German for "5 or more"); map it to "5" before coercion.
adults <- as.integer(ifelse(customers$residents.numAdult == "5 oder mehr",
                            "5", customers$residents.numAdult))
children <- as.integer(ifelse(customers$residents.numChildren == "5 oder mehr",
                              "5", customers$residents.numChildren))

# Total residents per household; a missing children count falls back to the
# adult count alone. Computed once and reused below instead of repeating the
# same ifelse() expression three times as before.
num.residents <- ifelse(is.na(children), adults, adults + children)
table(num.residents)

# think in classes. we have some very rare classes of number of residents (>5)
# Prepare pNumResidents to have multiple classes
customers$pNumResidents <- sapply(num.residents, function(a) {
  # test is.na() first: the original order (a==0 || is.na(a)) produced NA for
  # the first comparison and only worked because NA || TRUE happens to be TRUE
  if (is.na(a) || a == 0) {
    return(NA)
  } else if (a == 1) {
    return("1 person")
  } else if (a == 2) {
    return("2 persons")
  } else if (a <= 5) {
    return("3-5 persons")
  } else {
    return(">5 persons")
  }
})
# ordered factor so the class levels keep their natural ordering
customers$pNumResidents <- ordered(customers$pNumResidents,
                                   levels = c("1 person", "2 persons",
                                              "3-5 persons", ">5 persons"))
table(customers$pNumResidents)

# Prepare pNumResidents2 to have two (binary) classes
customers$pNumResidents2 <- sapply(num.residents, function(a) {
  if (is.na(a) || a == 0) {
    return(NA)
  } else if (a <= 2) {
    return("1-2 persons")
  } else {
    return(">2 persons")
  }
})
customers$pNumResidents2 <- ordered(customers$pNumResidents2,
                                    levels = c("1-2 persons", ">2 persons"))
table(customers$pNumResidents2)

# Attach the derived target variables (and housing type) to the feature table
all_data <- left_join(all_data,
                      customers %>% select(VID, pNumResidents, pNumResidents2,
                                           housing.type),
                      by = "VID")
```
```{r Classification with probabilities - binary class}
selected.features <- c("c_week","c_morning","c_noon","c_afternoon","c_evening",
                       "s_we_max","s_we_min","s_wd_max")

# Hold out ~30% of the households as test set (seed fixed for reproducibility)
set.seed(1506)
test.cases <- match(
  sample(all_data$VID, size = .3*nrow(all_data)),
  all_data$VID)

# Split into test/training data, dropping rows with missing values.
# (A stray bare `all_data` statement that printed the entire data frame to
# the notebook was removed here -- leftover debug output.)
testdata <- na.omit(all_data[test.cases, c("pNumResidents2", selected.features)])
traindata <- na.omit(all_data[-test.cases, c("pNumResidents2", selected.features)])

## SVM with probabilities -------
# probability = T makes svm() fit the extra model needed so that predict()
# can return class-probability estimates (used for the ROC curve below)
model <- svm(pNumResidents2 ~ . , data=traindata, probability = T)
clres <- predict(model, newdata=testdata, probability = T)

# confusion matrix (rows = predicted, columns = actual) and overall accuracy
cm <- table(clres, testdata$pNumResidents2)
(accuracy <- (sum(diag(cm))/sum(as.vector(cm))))

#for ROC curve, the probabilities as estimated by the classifier are needed
#([,1] selects the probability of the first class level)
# NOTE(review): variable name keeps the original misspelling "propabilities"
# because exercise code below ("FROM HERE ON EXERCISE") may reference it.
propabilities <- attributes(clres)$probabilities[,1]
# FROM HERE ON EXERCISE
```
```{r Classification with probabilities - multiclass}
# Same SVM evaluation as the binary-class chunk above, but on the four-class
# target pNumResidents. Reuses test.cases and selected.features defined there.
testdata <- na.omit(all_data[test.cases, c("pNumResidents", selected.features)])
traindata <- na.omit(all_data[-test.cases, c("pNumResidents", selected.features)])
# probability = T enables class-probability estimates from predict()
model <- svm(pNumResidents ~ . , data=traindata, probability = T)
clres <- predict(model, newdata=testdata, probability = T)
# confusion matrix (rows = predicted, columns = actual) and overall accuracy
cm <- table(clres, testdata$pNumResidents)
(accuracy <- (sum(diag(cm))/sum(as.vector(cm))))
#for ROC curve, the probabilities as estimated by the classifier are needed
#(here a matrix with one column per class, unlike the binary case)
propabilities <- attributes(clres)$probabilities
# FROM HERE ON EXERCISE
```
```{r Classification with an advanced evaluation technique: cross-validation}
# 10-fold cross-validation of the binary SVM classifier.
set.seed(1506)
folds <- 10
# randomly assign every row of all_data to one of the folds
all_data$crossfolds <- sample(1:folds, nrow(all_data), replace = TRUE)
# list for the interim results
results <- list()
for(foldIndex in 1:folds){
# creating data for the current fold: this fold is the test set,
# all remaining folds form the training set
testdata <- na.omit(all_data[all_data$crossfolds==foldIndex, c("pNumResidents2", selected.features)])
traindata <- na.omit(all_data[all_data$crossfolds!=foldIndex, c("pNumResidents2", selected.features)])
model <- svm(pNumResidents2 ~ . , data=traindata, probability = T)
# store per-fold artifacts: model, class probabilities, confusion matrix,
# accuracy and the true labels -- presumably for pooled analysis later
results[[foldIndex]] <- list()
results[[foldIndex]]$model <- model
clres <- predict(model, newdata=testdata, probability = T)
results[[foldIndex]]$probs <- attributes(clres)$probabilities
cm <- table(clres, testdata$pNumResidents2)
results[[foldIndex]]$cm <- cm
# NOTE(review): the nested assignment also overwrites a global `accuracy`
# on every iteration -- confirm this side effect is intended before removing
results[[foldIndex]]$accuracy <- (accuracy <- (sum(diag(cm))/sum(as.vector(cm))))
results[[foldIndex]]$labels <- testdata$pNumResidents2
}
# FROM HERE ON EXERCISE
```
......@@ -156,7 +156,7 @@ nl_mailsSend %>%
nl_mailsSend %>%
left_join(nl_links, by="EmailID") %>%
left_join(nl_clicks, by="LinkID") %>%
left_join(nl_clicks, by=c("LinkID"="LinkID", "URL"="URL")) %>%
group_by(EmailID) %>%
#this is pretty much the same as above
summarise(title = first(NewsletterTitle),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment