Commit f2d9a513 authored by Weigert, Andreas's avatar Weigert, Andreas
Browse files

updated Tutorial 7 and 8

parent 017388e6
---
title: 'Tutorial 8: Clustering'
output: html_notebook
editor_options:
  chunk_output_type: inline
---
This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.
```{r Load libraries}
library(dplyr)
library(lubridate)
library(psych)
library(cluster)
library(readr)
```
```{r Read and prepare data}
# Load the raw CSV exports. The literal string "NULL" in the files is parsed
# as a missing value, and CustomerID is cast to character in every table so
# the join key has the same type everywhere.
consumption <- read_csv(
  file = "../../data/clustering/bam_energy_report_consumption.csv",
  na = "NULL"
) %>%
  mutate(
    CustomerID = as.character(CustomerID),
    # 2012 consumption divided by the number of billing days in 2012
    consumption_normalized = consumption_2012 / billing_days_2012
  )

customer <- read_csv(
  file = "../../data/clustering/bam_energy_report_customers.csv",
  na = "NULL"
) %>%
  mutate(CustomerID = as.character(CustomerID))

logins <- read_csv(
  file = "../../data/clustering/bam_energy_report_logins.csv",
  na = "NULL"
) %>%
  mutate(CustomerID = as.character(CustomerID))

survey <- read_csv(
  file = "../../data/clustering/bam_energy_report_survey.csv",
  na = "NULL"
) %>%
  mutate(CustomerID = as.character(CustomerID))

portal_points <- read_csv(
  file = "../../data/clustering/bam_energy_report_portal_points.csv",
  na = "NULL"
) %>%
  mutate(CustomerID = as.character(CustomerID))

# Number of non-missing entries per survey row; the first column is excluded
# from the count.
survey <- survey %>%
  mutate(answers = rowSums(!is.na(survey[, -1])))

# One row per customer: total points and number of distinct task codes.
portal_points_agg <- portal_points %>%
  group_by(CustomerID) %>%
  summarise(
    points = sum(Points),
    different_actions = n_distinct(TaskCode)
  )

# One row per customer: number of rows in the login table.
logins_agg <- logins %>%
  count(CustomerID, name = "n_logins")

# Left-join every table onto the customer table, in order, by CustomerID.
data <- Reduce(
  function(left, right) left_join(left, right, by = "CustomerID"),
  list(customer, consumption, logins_agg, survey, portal_points_agg)
)
```
```{r Clustering 1 - A first try with extreme values}
# Features used for the first clustering attempt. Rows with a missing value in
# any selected column are dropped by na.omit().
data_clustering <- data %>%
  select(
    CustomerID, NumDevices, LivingAreaM2, HouseholdMembers,
    n_logins, points, different_actions
  ) %>%
  na.omit()

# simple clustering on the untransformed features; the first column
# (CustomerID) is only an identifier and is excluded from the input
k <- 3
set.seed(1) # fixed seed so the random starting centers are reproducible
cluster1 <- kmeans(x = data_clustering[, -1], centers = k)
data_clustering$kmeans_cluster1 <- cluster1$cluster
table(data_clustering$kmeans_cluster1)

# What we see: one cluster with only one customer inside. Why? Let's look at
# the values per cluster.
describeBy(
  data_clustering %>% select(-CustomerID),
  group = "kmeans_cluster1",
  skew = FALSE
)

# extreme value at the number of logins
plot(data_clustering$n_logins, data_clustering$kmeans_cluster1)
```
```{r Clustering 2 - A second try with untransformed data}
```
```{r Clustering 3 - A third try with transformed data}
```
```{r Determine the right number of clusters}
```
```{r Clustering 4 - Using an hierarchical clustering approach}
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment