---
title: "Clustering with smart meter data"
output: html_notebook
editor_options:
  chunk_output_type: inline
---

```{r load and format data}
# Load and prepare the data ----
# install.packages("reshape2")
library("reshape2")

# read.csv2() expects ";" as field separator and "," as decimal mark.
SMD_Readings <- read.csv2("../data/SMD_BIA_Data.csv")

# We use the data for a single week: keep only the timestamp codes that lie
# strictly between 25000 and 25700.
time_stamps <- as.numeric(as.character(unique(SMD_Readings$Timestamp)))
in_week <- time_stamps > 25000 & time_stamps < 25700
single_week <- as.character(time_stamps[in_week])
length(single_week)

# Select the measurements for that single week.
SMD_Readings <- SMD_Readings[SMD_Readings$Timestamp %in% single_week, ]

# The reshape2 package transforms the long table into a condensed wide table:
# one row per ID, one column per timestamp, cells holding the consumption.
Formatted_Readings <- dcast(
  SMD_Readings,
  ID ~ Timestamp,
  value.var = "Consumption"
)
head(Formatted_Readings)

# Remove the IDs from the table (kept separately in `IDs`) so that only the
# consumption columns are passed to the clustering functions.
IDs <- Formatted_Readings[, 1]
Formatted_Readings[[1]] <- NULL
```

```{r first k-Means clustering}
# Simple k-means clustering ----
# Fixed seed so the random initial centers are reproducible.
set.seed(1)
Cluster1 <- kmeans(Formatted_Readings, centers = 3)
```

```{r function to plot the results}
# Creating cluster visualization ----

# Plot the mean profile of each cluster over the first 48 columns.
#
# SMD_Readings: table with one row per observation, one column per time slot.
# clusters:     cluster label per row of `SMD_Readings` (same order).
# lim:          y-axis limits of the plot.
# ...:          further arguments forwarded to plot() (e.g. main, ylab).
plotcl <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  # Empty canvas (type = "n") that only sets up the axes.
  plot(colMeans(SMD_Readings[, 1:48]), type = "n", ylim = lim, ...)
  for (i in unique(clusters)) {
    # The cluster label doubles as the line colour.
    lines(colMeans(SMD_Readings[clusters == i, 1:48]), col = i, lwd = 3)
  }
}

# Same as plotcl(), but the mean profiles span all columns of the table.
plotclWeek <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  plot(colMeans(SMD_Readings[, ]), type = "n", ylim = lim, ...)
  for (i in unique(clusters)) {
    lines(colMeans(SMD_Readings[clusters == i, ]), col = i, lwd = 3)
  }
}

plotcl(Formatted_Readings, Cluster1$cluster, ylab = "Consumption (kWh)")
plotclWeek(Formatted_Readings, Cluster1$cluster, ylab = "Consumption (kWh)")
```

```{r second k-Meand clustering}
# Improve the clustering ----

# Normalize each row by its own mean and then run the k-means again, so that
# clusters reflect the shape of the profile rather than its overall level.
set.seed(2)
Data_Norm <- Formatted_Readings / rowMeans(Formatted_Readings)
Cluster2 <- kmeans(Data_Norm, centers = 3)
plotcl(
  Formatted_Readings,
  Cluster2$cluster,
  ylab = "Consumption (kWh)",
  main = "Mean-normalized consumption",
  lim = c(0, 1.2)
)

# Transform the values to a more normally distributed form (square root),
# normalize by the row mean and run the k-means again.
Datas <- sqrt(Formatted_Readings)
Datasn <- Datas / rowMeans(Datas)

# Remove the outlier observation. On a data frame the comma is required to
# drop a ROW; `Datasn[-297]` would drop the 297th column (a time slot).
# NOTE(review): confirm that row 297 is the intended outlier.
outlier_row <- 297
Datasn <- Datasn[-outlier_row, ]

set.seed(7)
Cluster3 <- kmeans(Datasn, centers = 3)

# Drop the same row from the plotted data so that the cluster labels stay
# aligned with the rows they were computed for.
plotcl(
  Formatted_Readings[-outlier_row, ],
  Cluster3$cluster,
  ylab = "Consumption (kWh)",
  main = "Sqrt and mean-normalized consumption",
  lim = c(0, 1.2)
)

# Some details about the model
table(Cluster3$cluster) # size of the clusters (counted from the labels)
Cluster3$size           # size of the clusters (as reported by kmeans)
Cluster3$withinss       # within-cluster sum of squares, one per cluster
Cluster3$betweenss      # between-cluster sum of squares
```

```{r obtain the optimal number of clusters}
# Fit k-means for k = 2, ..., 9; the seed is reset before every fit so each
# model starts from the same random state.
cluster_counts <- 2:9
Clusters <- lapply(cluster_counts, function(k) {
  set.seed(7)
  kmeans(Datasn, centers = k)
})
Clusters[[2]]$size

# The total within-cluster sum of squares per model
tot.withinss <- vapply(Clusters, function(v) v$tot.withinss, numeric(1))
plot(
  cluster_counts,
  tot.withinss,
  xlab = "Num. of clusters",
  ylab = "Total sum of squares",
  type = "b"
)

# The min / max sum of squares in the clusters
min.withinss <- vapply(Clusters, function(v) min(v$withinss), numeric(1))
max.withinss <- vapply(Clusters, function(v) max(v$withinss), numeric(1))
plot(
  cluster_counts,
  max.withinss,
  xlab = "Num. of clusters",
  ylab = "Within clusters sum of squares",
  type = "b",
  ylim = c(min(min.withinss), max(max.withinss))
)
lines(cluster_counts, min.withinss, type = "b", col = 2)
legend("topright", c("Max. WSS", "Min. WSS"), col = c(1, 2), lty = 1)

# Number of clusters that contain exactly one element, per model
numOneElemClusters <- vapply(
  Clusters,
  function(v) sum(v$size == 1),
  integer(1)
)
barplot(
  numOneElemClusters,
  names.arg = cluster_counts,
  main = "Single-element clusters",
  xlab = "Total number of k-Means clusters"
)
```

```{r hierarchical clustering}
library(cluster)

# Create the dissimilarity matrix: 1 - correlation between the rows
# (cor() works column-wise, hence the transpose).
C <- 1 - cor(t(Formatted_Readings))
Dendrogram <- agnes(C, diss = TRUE, method = "complete")
plot(Dendrogram, which.plots = 2) # plot the dendrogram

# Cut the tree into four groups and plot their mean profiles.
Cluster4 <- cutree(Dendrogram, k = 4)
plotcl(
  Formatted_Readings,
  Cluster4,
  main = "Hierarchical clustering results",
  ylab = "Consumption (kWh)",
  lim = c(0, 1.3)
)
```