Commit c1a8d587 authored by Hopf, Konstantin's avatar Hopf, Konstantin
Browse files

Materials for L07 on clustering

parent 2162618c
---
title: "Clustering with smart meter data"
output: html_notebook
editor_options:
  chunk_output_type: inline
---

```{r load and format data}
# load and prepare the data ----
# We use the data for a single week. SMD_Readings must already exist in the
# session as a long table with the columns ID, Timestamp and Consumption.
# Going through as.character() first means that, if Timestamp is a factor, its
# labels (not its internal integer codes) are converted to numbers.
time_stamps <- as.numeric(as.character(unique(SMD_Readings$Timestamp)))
# which() drops timestamps that could not be converted (NA). Plain logical
# indexing would keep them as NA, and the `%in%` below would then also select
# rows whose Timestamp is missing.
single_week <- as.character(
  time_stamps[which(time_stamps > 25000 & time_stamps < 25700)]
)
# select the measurements for one single week
SMD_Readings <- SMD_Readings[SMD_Readings$Timestamp %in% single_week, ]
# dcast() transforms the long data into a more condensed wide table: one row
# per ID and one column per timestamp
# NOTE(review): dcast() is presumably provided by reshape2 - confirm that the
# package is attached before this chunk runs.
Formatted_Readings <- dcast(SMD_Readings, ID ~ Timestamp, value.var = "Consumption")
# keep the IDs separately and remove them from the table, so that only the
# consumption values remain
IDs <- Formatted_Readings[, 1]
Formatted_Readings[, 1] <- NULL
```

```{r first k-Means clustering}
# Simple k-means clustering ----
# Partition the households into three groups based on their raw
# (untransformed) readings.
Cluster1 <- kmeans(x = Formatted_Readings, centers = 3)
```

```{r function to plot the results}
# Creating cluster visualization ----
#' Plot the mean profile of each cluster over the first 48 columns
#'
#' Opens an empty plot and draws one line per cluster: the column means of the
#' rows assigned to that cluster, restricted to columns 1 to 48.
#'
#' @param SMD_Readings Numeric data frame or matrix with one row per
#'   observation and at least 48 columns.
#' @param clusters Vector of cluster labels, one per row of `SMD_Readings`.
#'   Each label is also used as the line colour.
#' @param lim Numeric vector of length two used as `ylim` of the plot.
#' @param ... Further arguments forwarded to `plot()` (e.g. `ylab`, `main`).
#' @return `NULL`, invisibly; called for its side effect of plotting.
plotcl <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  # type = "n" only sets up the axes; the cluster lines are added below
  plot(colMeans(SMD_Readings[, 1:48, drop = FALSE]),
       type = "n", ylim = lim, ...)
  for (i in unique(clusters)) {
    # drop = FALSE keeps two dimensions even when a cluster holds one single
    # row of a matrix, which colMeans() requires
    lines(colMeans(SMD_Readings[clusters == i, 1:48, drop = FALSE]),
          col = i, lwd = 3)
  }
  invisible(NULL)
}
#' Plot the mean profile of each cluster over all columns
#'
#' Opens an empty plot and draws one line per cluster: the column means of the
#' rows assigned to that cluster, using every column of the input.
#'
#' @param SMD_Readings Numeric data frame or matrix with one row per
#'   observation.
#' @param clusters Vector of cluster labels, one per row of `SMD_Readings`.
#'   Each label is also used as the line colour.
#' @param lim Numeric vector of length two used as `ylim` of the plot.
#' @param ... Further arguments forwarded to `plot()` (e.g. `ylab`, `main`).
#' @return `NULL`, invisibly; called for its side effect of plotting.
plotclWeek <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  # type = "n" only sets up the axes; the cluster lines are added below
  plot(colMeans(SMD_Readings), type = "n", ylim = lim, ...)
  for (i in unique(clusters)) {
    # drop = FALSE keeps two dimensions even when a cluster holds one single
    # row of a matrix, which colMeans() requires
    lines(colMeans(SMD_Readings[clusters == i, , drop = FALSE]),
          col = i, lwd = 3)
  }
  invisible(NULL)
}
# Visualize the first clustering: once restricted to the first 48 columns,
# once over all columns of the selected week.
plotcl(
  SMD_Readings = Formatted_Readings,
  clusters = Cluster1$cluster,
  ylab = "Consumption (kWh)"
)
plotclWeek(
  SMD_Readings = Formatted_Readings,
  clusters = Cluster1$cluster,
  ylab = "Consumption (kWh)"
)
```

```{r second k-Means clustering}
# Improve the clustering ----
# Normalize the values and then run the k-means again.
# Each row is divided by its own mean, so the clustering compares the shape of
# the profiles rather than their absolute level.
Data_Norm <- Formatted_Readings / rowMeans(Formatted_Readings)
Cluster2 <- kmeans(Data_Norm, centers = 3)
# The clusters come from the normalized data, but the plot shows the mean of
# the original readings per cluster.
plotcl(Formatted_Readings, Cluster2$cluster, ylab = "Consumption (kWh)",
       main = "Mean-normalized consumption", lim = c(0, 1.2))
# Transform the values to a more normally distributed form (square root), apply
# the same row-mean normalization and run the k-means again
Datas <- sqrt(Formatted_Readings)
Datasn <- Datas / rowMeans(Datas)
Cluster3 <- kmeans(Datasn, centers = 3)
plotcl(Formatted_Readings, Cluster3$cluster, ylab = "Consumption (kWh)",
       main = "Sqrt and mean-normalized consumption", lim = c(0, 1.2))
```

```{r obtain the optimal number of clusters}
# Obtain the optimal number of clusters ----
# Fit one k-means model per candidate number of clusters on the
# sqrt-transformed, row-mean-normalized data. The candidates are defined once
# and reused for every plot below.
k_values <- 2:9
Clusters <- lapply(k_values, function(k) kmeans(Datasn, centers = k))

# the total within-cluster sum of squares per model
tot.withinss <- vapply(Clusters, function(v) v$tot.withinss, numeric(1))
plot(k_values, tot.withinss, xlab = "Num. of clusters",
     ylab = "Total within-cluster sum of squares", type = "b")

# the min / max sum of squares in the clusters of each model
min.withinss <- vapply(Clusters, function(v) min(v$withinss), numeric(1))
max.withinss <- vapply(Clusters, function(v) max(v$withinss), numeric(1))
# the y-range covers both series so that the second line is not cut off
plot(k_values, max.withinss, xlab = "Num. of clusters",
     ylab = "Within clusters sum of squares", type = "b",
     ylim = c(min(min.withinss), max(max.withinss)))
lines(k_values, min.withinss, type = "b", col = 2)
legend("topright", c("Max. WSS", "Min. WSS"), col = c(1, 2), lty = 1)

# number of clusters per model that contain exactly one observation
numOneElemClusters <- vapply(Clusters, function(v) sum(v$size == 1), integer(1))
barplot(numOneElemClusters, names.arg = k_values,
        main = "Single-element clusters",
        xlab = "Total number of k-Means clusters")
```

```{r hierarchical clustering}
# Hierarchical clustering ----
# Create the dissimilarity matrix: one minus the correlation between the rows
# of Formatted_Readings (cor() works column-wise, hence the transpose).
C <- 1 - cor(t(Formatted_Readings))
# diss = TRUE tells agnes() that C already holds dissimilarities
# NOTE(review): agnes() is presumably provided by the cluster package - confirm
# that the package is attached before this chunk runs.
Dendrogram <- agnes(C, diss = TRUE, method = "complete")
plot(Dendrogram, which.plot = 2) # plot the dendrogram
# cut the tree into three groups
Cluster4 <- cutree(Dendrogram, k = 3)
plotcl(Formatted_Readings, Cluster4, main = "Hierarchical clustering results",
       ylab = "Consumption (kWh)", lim = c(0, 1.3))
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment