BIA_T10_Classification2.Rmd 6.12 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
---
title:  'Tutorial 9: Classification'
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.


```{r Load required packages}
library(FSelector) #for feature selection
library(e1071) #for classification algorithm SVM
library(randomForest) #further random forest
library(ROCR) #for illustration of classification performance
library(dplyr) #for data wrangling
```


```{r Load and prepare data}
# Load data
22
load("../../data/classification2.RData")
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

# Derive and investigate the dependent variable "number of residents"
adults <- as.integer(ifelse(customers$residents.numAdult=="5 oder mehr",
                            "5",customers$residents.numAdult))
children <- as.integer(ifelse(customers$residents.numChildren=="5 oder mehr",
                              "5",customers$residents.numChildren))

table(ifelse(is.na(children), adults, adults+children))
# think in classes. we have some very rare classes of number of residents (>5)

# Prepare pNumResidents to have multiple classes
customers$pNumResidents <- sapply(ifelse(is.na(children), adults, adults+children), 
                                       function(a) {
  if(a==0 || is.na(a)){
    return(NA)
  } else if(a==1){
    return("1 person")
  } else if(a==2){
    return("2 persons")
  } else if(a<=5){
    return("3-5 persons")
  } else {
    return(">5 persons")
  }
})

customers$pNumResidents <- ordered(customers$pNumResidents, 
                                      levels=c("1 person", "2 persons", 
                                               "3-5 persons", ">5 persons"))
table(customers$pNumResidents)


# Prepare pNumResidents to have two (binary) classes
customers$pNumResidents2 <- sapply(ifelse(is.na(children), adults, adults+children), 
                                       function(a) {
  if(a==0 || is.na(a)){
    return(NA)
  } else if(a<=2){
    return("1-2 persons")
  } else {
    return(">2 persons")
  }
})

customers$pNumResidents2 <- ordered(customers$pNumResidents2, 
                                      levels=c("1-2 persons", ">2 persons"))
table(customers$pNumResidents2)

all_data <- left_join(all_data, customers %>% select(VID, pNumResidents, pNumResidents2, housing.type), by="VID")
```


```{r Classification with probabilities - binary class}
selected.features <- c("c_week","c_morning","c_noon","c_afternoon","c_evening","s_we_max","s_we_min","s_wd_max")

set.seed(1506)
test.cases <- match(
  sample(all_data$VID, size = .3*nrow(all_data)), 
  all_data$VID)

all_data

testdata <- na.omit(all_data[test.cases, c("pNumResidents2", selected.features)])
traindata <- na.omit(all_data[-test.cases, c("pNumResidents2", selected.features)])


## SVM with probabilities -------

model <- svm(pNumResidents2 ~ . , data=traindata, probability = T)
clres <- predict(model, newdata=testdata, probability = T)

cm <- table(clres, testdata$pNumResidents2)
(accuracy <- (sum(diag(cm))/sum(as.vector(cm))))

#for ROC curve, the probabilities as estimated by the classifier are needed
propabilities <- attributes(clres)$probabilities[,1]

# FROM HERE ON EXERCISE

pred <- prediction(predictions = propabilities, labels = testdata$pNumResidents2)
performance(pred, "auc")

#the ROC curve shows true-positive-rate vs. false-positive rate
plot(performance(pred, "tpr", "fpr"))
```


110

111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
```{r Classification with probabilities - multiclass}
testdata <- na.omit(all_data[test.cases, c("pNumResidents", selected.features)])
traindata <- na.omit(all_data[-test.cases, c("pNumResidents", selected.features)])

model <- svm(pNumResidents ~ . , data=traindata, probability = T)
clres <- predict(model, newdata=testdata, probability = T)

cm <- table(clres, testdata$pNumResidents)
(accuracy <- (sum(diag(cm))/sum(as.vector(cm))))

#for ROC curve, the probabilities as estimated by the classifier are needed
propabilities <- attributes(clres)$probabilities


# FROM HERE ON EXERCISE

# It's a multiclass problem but We are only interested in the class "1 person"
pred <- prediction(predictions = propabilities[,"1 person"], labels = testdata$pNumResidents == "1 person")

#the area under ROC curve is a good evaluation criterion
performance(pred, "auc")

#the ROC curve shows true-positive-rate vs. false-positive rate
plot(performance(pred, "tpr", "fpr"))
```
136
137
138
139


```{r Classification with an advanced evaluation technique: cross-validation}
set.seed(1506)
Weigert, Andreas's avatar
Weigert, Andreas committed
140
141
142
143

folds <- 10

all_data$crossfolds <- sample(1:folds, nrow(all_data), replace = TRUE)
144
145
146
147

# list for the interim results
results <- list()

Weigert, Andreas's avatar
Weigert, Andreas committed
148
for(foldIndex in 1:folds){
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
  # creating data for the 
  testdata <- na.omit(all_data[all_data$crossfolds==foldIndex, c("pNumResidents2", selected.features)])
  traindata <- na.omit(all_data[all_data$crossfolds!=foldIndex, c("pNumResidents2", selected.features)])
  
  model <- svm(pNumResidents2 ~ . , data=traindata, probability = T)
  results[[foldIndex]] <- list()
  results[[foldIndex]]$model <- model
  
  clres <- predict(model, newdata=testdata, probability = T)
  results[[foldIndex]]$probs <- attributes(clres)$probabilities

  cm <- table(clres, testdata$pNumResidents2)
  results[[foldIndex]]$cm <- cm
  
  results[[foldIndex]]$accuracy <- (accuracy <- (sum(diag(cm))/sum(as.vector(cm))))
  
  results[[foldIndex]]$labels <- testdata$pNumResidents2
  
}

# FROM HERE ON EXERCISE

# calculate the average accuracy
accuracy_array <- simplify2array(lapply(X = results, FUN = function(i) i$accuracy))
accuracy_mean <- mean(accuracy_array)
accuracy_mean


# print the different roc curves for each model and calculate the average auc, your target class is ">2 persons"
propabilities_list <- lapply(X = results, FUN = function(i) i$probs[,">2 persons"])
label_list <- lapply(X = results, FUN = function(i) i$labels)

pred <- prediction(predictions = propabilities_list, labels = label_list)

auc_array <- unlist(performance(pred, "auc")@y.values)
auc_mean <- mean(auc_array)
auc_mean

#the ROC curve shows true-positive-rate vs. false-positive rate
plot(performance(pred, "tpr", "fpr"))
Weigert, Andreas's avatar
Weigert, Andreas committed
189
190
191
192
193


# Test if the AUCs are expexted to be greater than random (AUC=0.5)
t.test(auc_array, mu = 0.5, alternative = "greater")

194
195
```