Commit 462720ba authored by Weigert, Andreas's avatar Weigert, Andreas
Browse files
parents 12a85b77 c908fd69
---
title: "Clustering with smart meter data"
output: html_notebook
editor_options:
  chunk_output_type: inline
---
```{r load and format data}
# Load and prepare the data ----
SMD_Readings <- read.csv2("../data/SMD_BIA_Data.csv")

# Keep only the timestamps that fall strictly between 25000 and 25700
# (the readings of a single week).
time_stamps <- as.numeric(as.character(unique(SMD_Readings$Timestamp)))
single_week <- as.character(
  time_stamps[time_stamps > 25000 & time_stamps < 25700]
)
length(single_week)

# Restrict the readings to the measurements of that week.
SMD_Readings <- SMD_Readings[SMD_Readings$Timestamp %in% single_week, ]

# reshape2::dcast() turns the long table into a wide one:
# one row per ID, one column per Timestamp, cells holding Consumption.
# install.packages("reshape2")
library("reshape2")
Formatted_Readings <- dcast(
  SMD_Readings,
  ID ~ Timestamp,
  value.var = "Consumption"
)
head(Formatted_Readings)

# Keep the IDs separately and drop them from the table so that only the
# consumption columns remain.
IDs <- Formatted_Readings[[1]]
Formatted_Readings[[1]] <- NULL
```
```{r first k-Means clustering}
# Baseline k-means clustering ----
# Seed the RNG first so the random start of kmeans() is reproducible.
set.seed(1)
Cluster1 <- kmeans(x = Formatted_Readings, centers = 3)
```
```{r function to plot the results}
# Creating cluster visualization ----
#' Plot the mean profile of every cluster over the first 48 columns
#'
#' Opens an empty plot frame (y-axis fixed by `lim`) and then adds one line
#' per distinct value in `clusters`, showing the column means of the rows
#' that belong to that cluster.
#'
#' @param SMD_Readings Data frame or numeric matrix with one row per
#'   observation. Only the first 48 columns are used (fewer if the input has
#'   fewer columns) -- presumably one day of readings; confirm against the
#'   data set.
#' @param clusters Vector with one cluster id per row of `SMD_Readings`.
#'   The id is also used as the line colour.
#' @param lim Numeric vector of length two passed on as `ylim`.
#' @param ... Further arguments forwarded to `plot()` (e.g. `main`, `ylab`).
#' @return `NULL`, invisibly; called for its plotting side effect.
plotcl <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  # Clamp to the available columns so inputs narrower than 48 columns work.
  day_cols <- seq_len(min(48, ncol(SMD_Readings)))

  # type = "n" only sets up the axes; the cluster lines are added below.
  plot(
    colMeans(SMD_Readings[, day_cols, drop = FALSE]),
    type = "n", ylim = lim, ...
  )

  for (i in unique(clusters)) {
    # drop = FALSE keeps a single-member cluster two-dimensional; without it
    # a one-row matrix collapses to a vector and colMeans() fails.
    members <- SMD_Readings[clusters == i, day_cols, drop = FALSE]
    lines(colMeans(members), col = i, lwd = 3)
  }

  invisible(NULL)
}
#' Plot the mean profile of every cluster over all columns
#'
#' Same idea as a per-cluster mean plot, but uses every column of the input
#' instead of a fixed prefix: an empty frame is drawn first (y-axis fixed by
#' `lim`), then one line per distinct value in `clusters` is added.
#'
#' @param SMD_Readings Data frame or numeric matrix with one row per
#'   observation and one column per reading.
#' @param clusters Vector with one cluster id per row of `SMD_Readings`.
#'   The id is also used as the line colour.
#' @param lim Numeric vector of length two passed on as `ylim`.
#' @param ... Further arguments forwarded to `plot()` (e.g. `main`, `ylab`).
#' @return `NULL`, invisibly; called for its plotting side effect.
plotclWeek <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  # type = "n" only sets up the axes; the cluster lines are added below.
  plot(colMeans(SMD_Readings), type = "n", ylim = lim, ...)

  for (i in unique(clusters)) {
    # drop = FALSE keeps a single-member cluster two-dimensional; without it
    # a one-row matrix collapses to a vector and colMeans() fails.
    members <- SMD_Readings[clusters == i, , drop = FALSE]
    lines(colMeans(members), col = i, lwd = 3)
  }

  invisible(NULL)
}
# Show the first clustering twice: over the first 48 columns (plotcl) and
# over all columns of the week (plotclWeek).
plotcl(
  SMD_Readings = Formatted_Readings,
  clusters = Cluster1$cluster,
  ylab = "Consumption (kWh)"
)
plotclWeek(
  SMD_Readings = Formatted_Readings,
  clusters = Cluster1$cluster,
  ylab = "Consumption (kWh)"
)
```
```{r second k-Means clustering}
# Improve the clustering ----
# Divide every row (one ID per row) by its own mean consumption, then run
# k-means again on the normalized values.
set.seed(2)
Data_Norm <- Formatted_Readings / rowMeans(Formatted_Readings)
Cluster2 <- kmeans(Data_Norm, centers = 3)
plotcl(
  Formatted_Readings, Cluster2$cluster,
  ylab = "Consumption (kWh)",
  main = "Mean-normalized consumption",
  lim = c(0, 1.2)
)

# Apply a square-root transform before the row-mean normalization and run
# k-means once more.
Datas <- sqrt(Formatted_Readings)
Datasn <- Datas / rowMeans(Datas)

# Remove the outlier observation. Rows are IDs and columns are timestamps,
# so the row form `[-i, ]` is required: on a data frame, `Datasn[-297]`
# would drop COLUMN 297 (one timestamp) and keep the outlier.
# NOTE(review): assumes the outlier is the row at position 297 -- confirm.
outlier_row <- 297
Datasn <- Datasn[-outlier_row, ]

set.seed(7)
Cluster3 <- kmeans(Datasn, centers = 3)

# Plot the raw readings without the outlier row so that the rows line up
# one-to-one with the entries of Cluster3$cluster.
plotcl(
  Formatted_Readings[-outlier_row, ], Cluster3$cluster,
  ylab = "Consumption (kWh)",
  main = "Sqrt and mean-normalized consumption",
  lim = c(0, 1.2)
)

# Some details about the model
table(Cluster3$cluster) # cluster sizes, counted from the assignments
Cluster3$size           # cluster sizes, as reported by kmeans()
Cluster3$withinss       # within-cluster sum of squares, one per cluster
Cluster3$betweenss      # between-cluster sum of squares
```
```{r obtain the optimal number of clusters}
# Fit k-means for k = 2, ..., 9 and compare the solutions ----
k_values <- 2:9

# One model per k; the seed is reset before every fit so each run starts
# from the same RNG state.
Clusters <- vector("list", length(k_values))
for (i in seq_along(k_values)) {
  set.seed(7)
  Clusters[[i]] <- kmeans(Datasn, centers = k_values[i])
}
Clusters[[2]]$size

# Total within-cluster sum of squares for every k
tot.withinss <- vapply(Clusters, function(v) v$tot.withinss, numeric(1))
plot(
  k_values, tot.withinss,
  xlab = "Num. of clusters", ylab = "Total sum of squares", type = "b"
)

# Smallest and largest within-cluster sum of squares for every k
min.withinss <- vapply(Clusters, function(v) min(v$withinss), numeric(1))
max.withinss <- vapply(Clusters, function(v) max(v$withinss), numeric(1))
plot(
  k_values, max.withinss,
  xlab = "Num. of clusters",
  ylab = "Within clusters sum of squares",
  type = "b",
  ylim = range(min.withinss, max.withinss)
)
lines(k_values, min.withinss, type = "b", col = 2)
legend("topright", c("Max. WSS", "Min. WSS"), col = c(1, 2), lty = 1)

# How many clusters contain exactly one element, for every k
numOneElemClusters <- vapply(
  Clusters,
  function(v) sum(v$size == 1),
  integer(1)
)
barplot(
  numOneElemClusters,
  names.arg = k_values,
  main = "Single-element clusters",
  xlab = "Total number of k-Means clusters"
)
```
```{r hierarchical clustering}
library(cluster)

# Dissimilarity matrix: one minus the pairwise correlation between rows
# (the table is transposed because cor() correlates columns).
C <- 1 - cor(t(Formatted_Readings))

# Agglomerative clustering with complete linkage on that dissimilarity.
Dendrogram <- agnes(C, diss = TRUE, method = "complete")
plot(Dendrogram, which.plots = 2) # plot the dendrogram

# Cut the tree into four clusters and plot their mean profiles.
Cluster4 <- cutree(Dendrogram, k = 4)
plotcl(
  Formatted_Readings, Cluster4,
  main = "Hierarchical clustering results",
  ylab = "Consumption (kWh)",
  lim = c(0, 1.3)
)
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment