Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
eesys-public
BIA-Resources
Commits
b270aa49
Commit
b270aa49
authored
Jan 07, 2019
by
Weigert, Andreas
Browse files
added solution for Tutorial 10, Task 1-2
parent
91c594e0
Changes
1
Hide whitespace changes
Inline
Side-by-side
R/BIA_T10_Classification2.Rmd
0 → 100644
View file @
b270aa49
---
title
:
'Tutorial 9: Classification'
output
:
html_notebook
editor_options
:
chunk_output_type
:
inline
---
This
file
is
part
of
the
lecture
Business
Intelligence
&
Analytics
(
EESYS
-
BIA
-
M
),
Information
Systems
and
Energy
Efficient
Systems
,
University
of
Bamberg
.
```{
r
Load
required
packages
}
library
(
FSelector
)
#
for
feature
selection
library
(
e1071
)
#
for
classification
algorithm
SVM
library
(
randomForest
)
#
further
random
forest
library
(
ROCR
)
#
for
illustration
of
classification
performance
library
(
dplyr
)
#
for
data
wrangling
```
```{
r
Load
and
prepare
data
}
#
Load
data
load
(
"../data/classification2.RData"
)
#
Derive
and
investigate
the
dependent
variable
"number of residents"
adults
<-
as
.
integer
(
ifelse
(
customers
$
residents
.
numAdult
==
"5 oder mehr"
,
"5"
,
customers
$
residents
.
numAdult
))
children
<-
as
.
integer
(
ifelse
(
customers
$
residents
.
numChildren
==
"5 oder mehr"
,
"5"
,
customers
$
residents
.
numChildren
))
table
(
ifelse
(
is
.
na
(
children
),
adults
,
adults
+
children
))
#
think
in
classes
.
we
have
some
very
rare
classes
of
number
of
residents
(>
5
)
#
Prepare
pNumResidents
to
have
multiple
classes
customers
$
pNumResidents
<-
sapply
(
ifelse
(
is
.
na
(
children
),
adults
,
adults
+
children
),
function
(
a
)
{
if
(
a
==
0
||
is
.
na
(
a
)){
return
(
NA
)
}
else
if
(
a
==
1
){
return
(
"1 person"
)
}
else
if
(
a
==
2
){
return
(
"2 persons"
)
}
else
if
(
a
<=
5
){
return
(
"3-5 persons"
)
}
else
{
return
(
">5 persons"
)
}
})
customers
$
pNumResidents
<-
ordered
(
customers
$
pNumResidents
,
levels
=
c
(
"1 person"
,
"2 persons"
,
"3-5 persons"
,
">5 persons"
))
table
(
customers
$
pNumResidents
)
#
Prepare
pNumResidents
to
have
two
(
binary
)
classes
customers
$
pNumResidents2
<-
sapply
(
ifelse
(
is
.
na
(
children
),
adults
,
adults
+
children
),
function
(
a
)
{
if
(
a
==
0
||
is
.
na
(
a
)){
return
(
NA
)
}
else
if
(
a
<=
2
){
return
(
"1-2 persons"
)
}
else
{
return
(
">2 persons"
)
}
})
customers
$
pNumResidents2
<-
ordered
(
customers
$
pNumResidents2
,
levels
=
c
(
"1-2 persons"
,
">2 persons"
))
table
(
customers
$
pNumResidents2
)
all_data
<-
left_join
(
all_data
,
customers
%>%
select
(
VID
,
pNumResidents
,
pNumResidents2
,
housing
.
type
),
by
=
"VID"
)
```
```{
r
Classification
with
probabilities
-
binary
class
}
selected
.
features
<-
c
(
"c_week"
,
"c_morning"
,
"c_noon"
,
"c_afternoon"
,
"c_evening"
,
"s_we_max"
,
"s_we_min"
,
"s_wd_max"
)
set
.
seed
(
1506
)
test
.
cases
<-
match
(
sample
(
all_data
$
VID
,
size
=
.3
*
nrow
(
all_data
)),
all_data
$
VID
)
all_data
testdata
<-
na
.
omit
(
all_data
[
test
.
cases
,
c
(
"pNumResidents2"
,
selected
.
features
)])
traindata
<-
na
.
omit
(
all_data
[-
test
.
cases
,
c
(
"pNumResidents2"
,
selected
.
features
)])
##
SVM
with
probabilities
-------
model
<-
svm
(
pNumResidents2
~
.
,
data
=
traindata
,
probability
=
T
)
clres
<-
predict
(
model
,
newdata
=
testdata
,
probability
=
T
)
cm
<-
table
(
clres
,
testdata
$
pNumResidents2
)
(
accuracy
<-
(
sum
(
diag
(
cm
))/
sum
(
as
.
vector
(
cm
))))
#
for
ROC
curve
,
the
probabilities
as
estimated
by
the
classifier
are
needed
propabilities
<-
attributes
(
clres
)$
probabilities
[,
1
]
#
FROM
HERE
ON
EXERCISE
pred
<-
prediction
(
predictions
=
propabilities
,
labels
=
testdata
$
pNumResidents2
)
performance
(
pred
,
"auc"
)
#
the
ROC
curve
shows
true
-
positive
-
rate
vs
.
false
-
positive
rate
plot
(
performance
(
pred
,
"tpr"
,
"fpr"
))
```
```{
r
Classification
with
probabilities
-
multiclass
}
testdata
<-
na
.
omit
(
all_data
[
test
.
cases
,
c
(
"pNumResidents"
,
selected
.
features
)])
traindata
<-
na
.
omit
(
all_data
[-
test
.
cases
,
c
(
"pNumResidents"
,
selected
.
features
)])
model
<-
svm
(
pNumResidents
~
.
,
data
=
traindata
,
probability
=
T
)
clres
<-
predict
(
model
,
newdata
=
testdata
,
probability
=
T
)
cm
<-
table
(
clres
,
testdata
$
pNumResidents
)
(
accuracy
<-
(
sum
(
diag
(
cm
))/
sum
(
as
.
vector
(
cm
))))
#
for
ROC
curve
,
the
probabilities
as
estimated
by
the
classifier
are
needed
propabilities
<-
attributes
(
clres
)$
probabilities
#
FROM
HERE
ON
EXERCISE
#
It
's a multiclass problem but We are only interested in the class "1 person"
pred <- prediction(predictions = propabilities[,"1 person"], labels = testdata$pNumResidents == "1 person")
#the area under ROC curve is a good evaluation criterion
performance(pred, "auc")
#the ROC curve shows true-positive-rate vs. false-positive rate
plot(performance(pred, "tpr", "fpr"))
```
```{r Classification with an advanced evaluation technique: cross-validation}
set.seed(1506)
all_data$crossfolds <- sample(1:5, nrow(all_data), replace = TRUE)
# list for the interim results
results <- list()
for(foldIndex in 1:5){
# creating data for the
testdata <- na.omit(all_data[all_data$crossfolds==foldIndex, c("pNumResidents2", selected.features)])
traindata <- na.omit(all_data[all_data$crossfolds!=foldIndex, c("pNumResidents2", selected.features)])
model <- svm(pNumResidents2 ~ . , data=traindata, probability = T)
results[[foldIndex]] <- list()
results[[foldIndex]]$model <- model
clres <- predict(model, newdata=testdata, probability = T)
results[[foldIndex]]$probs <- attributes(clres)$probabilities
cm <- table(clres, testdata$pNumResidents2)
results[[foldIndex]]$cm <- cm
results[[foldIndex]]$accuracy <- (accuracy <- (sum(diag(cm))/sum(as.vector(cm))))
results[[foldIndex]]$labels <- testdata$pNumResidents2
}
# FROM HERE ON EXERCISE
# calculate the average accuracy
accuracy_array <- simplify2array(lapply(X = results, FUN = function(i) i$accuracy))
accuracy_mean <- mean(accuracy_array)
accuracy_mean
# print the different roc curves for each model and calculate the average auc, your target class is ">2 persons"
propabilities_list <- lapply(X = results, FUN = function(i) i$probs[,">2 persons"])
label_list <- lapply(X = results, FUN = function(i) i$labels)
pred <- prediction(predictions = propabilities_list, labels = label_list)
auc_array <- unlist(performance(pred, "auc")@y.values)
auc_mean <- mean(auc_array)
auc_mean
#the ROC curve shows true-positive-rate vs. false-positive rate
plot(performance(pred, "tpr", "fpr"))
```
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment