BIA_L07_Clustering_SMD.Rmd 3.93 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
---
title: "Clustering with smart meter data"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

```{r load and format data}
# load and prepare the data ----
# read.csv2() reads ";"-separated files that use "," as the decimal mark
SMD_Readings <- read.csv2("../data/SMD_BIA_Data.csv")

# we use the data for a single week: keep only the timestamps that lie
# strictly between 25000 and 25700
time_stamps <- as.numeric(as.character(unique(SMD_Readings$Timestamp)))
single_week <- as.character(
  time_stamps[time_stamps > 25000 & time_stamps < 25700]
)
length(single_week) # number of distinct timestamps inside the window

# select the measurements for one single week
SMD_Readings <- SMD_Readings[SMD_Readings$Timestamp %in% single_week, ]

# the reshape2 package helps us to transform the data into a more condensed
# table form: one row per ID and one column per timestamp
# install.packages("reshape2")
library("reshape2")

Formatted_Readings <- dcast(SMD_Readings, ID ~ Timestamp,
                            value.var = "Consumption")
head(Formatted_Readings)

# remove the IDs from the table; they stay available in `IDs`
IDs <- Formatted_Readings[, 1]
Formatted_Readings[, 1] <- NULL
```

```{r first k-Means clustering}
# simple k-means clustering ----
# seed the RNG first so that the random starting centres are reproducible
set.seed(1)
Cluster1 <- kmeans(x = Formatted_Readings, centers = 3)
```


```{r function to plot the results}
# Creating cluster visualization ----

# Plot the mean profile of every cluster over the first 48 columns.
#
# SMD_Readings: data.frame or numeric matrix with one row per observation and
#               at least 48 columns; only columns 1:48 are drawn
#               (presumably one day of readings -- TODO confirm).
# clusters:     vector of cluster labels, one per row of SMD_Readings; each
#               label is also used as the colour of its line.
# lim:          y-axis limits of the plot.
# ...:          further arguments passed on to plot() (e.g. ylab, main).
#
# Returns NULL invisibly; the function is called for the plot it draws.
plotcl <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  first_cols <- 1:48
  # empty canvas with the right x-range; one line per cluster is added below
  plot(colMeans(SMD_Readings[, first_cols, drop = FALSE]),
       type = "n", ylim = lim, ...)
  for (i in unique(clusters)) {
    # drop = FALSE keeps a one-row cluster two-dimensional; without it a
    # matrix input collapses to a vector and colMeans() fails
    lines(colMeans(SMD_Readings[clusters == i, first_cols, drop = FALSE]),
          col = i, lwd = 3)
  }
  invisible(NULL)
}

# Plot the mean profile of every cluster over all columns of the data.
#
# SMD_Readings: data.frame or numeric matrix with one row per observation;
#               every column is drawn.
# clusters:     vector of cluster labels, one per row of SMD_Readings; each
#               label is also used as the colour of its line.
# lim:          y-axis limits of the plot.
# ...:          further arguments passed on to plot() (e.g. ylab, main).
#
# Returns NULL invisibly; the function is called for the plot it draws.
plotclWeek <- function(SMD_Readings, clusters, lim = c(0, 2), ...) {
  # empty canvas with the right x-range; one line per cluster is added below
  plot(colMeans(SMD_Readings), type = "n", ylim = lim, ...)
  for (i in unique(clusters)) {
    # drop = FALSE keeps a one-row cluster two-dimensional; without it a
    # matrix input collapses to a vector and colMeans() fails
    lines(colMeans(SMD_Readings[clusters == i, , drop = FALSE]),
          col = i, lwd = 3)
  }
  invisible(NULL)
}

# visualise the first k-means result: once for the first 48 columns only,
# once across all columns
plotcl(Formatted_Readings, clusters = Cluster1$cluster,
       ylab = "Consumption (kWh)")
plotclWeek(Formatted_Readings, clusters = Cluster1$cluster,
           ylab = "Consumption (kWh)")
```

```{r second k-Means clustering}
# Improve the clustering ----
# Normalize the values and then run the k-means again.
# Every row is divided by its own mean, so rows are compared by the shape of
# their profile rather than by their overall level.
# NOTE(review): a row whose mean is 0 would turn into NaN here -- confirm the
# data contains no such row.
set.seed(2)
Data_Norm <- Formatted_Readings / rowMeans(Formatted_Readings)
Cluster2 <- kmeans(Data_Norm, centers = 3)
# the clusters come from the normalized data, the curves show the raw readings
plotcl(Formatted_Readings, Cluster2$cluster, ylab = "Consumption (kWh)",
       main = "Mean-normalized consumption", lim = c(0, 1.2))

# Transform the values to a more normally distributed form (square root),
# normalize by the row mean and run the k-means again
Datas <- sqrt(Formatted_Readings)
Datasn <- Datas / rowMeans(Datas)
set.seed(4)
Cluster3 <- kmeans(Datasn, centers = 3)
plotcl(Formatted_Readings, Cluster3$cluster, ylab = "Consumption (kWh)",
       main = "Sqrt and mean-normalized consumption", lim = c(0, 1.2))

```

```{r obtain the optimal number of clusters}
# Run k-means for every candidate number of clusters.
# The calls are made in increasing order of k after a single set.seed(), so
# the random starts are drawn in the same sequence on every run.
set.seed(2)
k_values <- 2:9
Clusters <- lapply(k_values, function(k) kmeans(Datasn, centers = k))

# the total within-cluster sum of squares for every k
tot.withinss <- vapply(Clusters, function(v) v$tot.withinss, numeric(1))
plot(k_values, tot.withinss, xlab = "Num. of clusters",
     ylab = "Total within-cluster sum of squares", type = "b")

# the min / max sum of squares in the clusters
min.withinss <- vapply(Clusters, function(v) min(v$withinss), numeric(1))
max.withinss <- vapply(Clusters, function(v) max(v$withinss), numeric(1))
# the y-range has to cover both curves, the second one is added with lines()
plot(k_values, max.withinss, xlab = "Num. of clusters",
     ylab = "Within clusters sum of squares", type = "b",
     ylim = c(min(min.withinss), max(max.withinss)))
lines(k_values, min.withinss, type = "b", col = 2)
legend("topright", c("Max. WSS", "Min. WSS"), col = c(1, 2), lty = 1)

# how many clusters consist of one single observation, for every k
numOneElemClusters <- vapply(Clusters, function(v) sum(v$size == 1),
                             integer(1))
barplot(numOneElemClusters, names.arg = k_values,
        main = "Single-element clusters",
        xlab = "Total number of k-Means clusters")
```


```{r hierarchical clustering}
library(cluster)

# create distance matrix: 1 - correlation between the rows of the readings,
# so rows with a similar profile shape are close to each other
C <- 1 - cor(t(Formatted_Readings))

# agglomerative clustering with complete linkage; diss = TRUE tells agnes()
# that C already holds dissimilarities
Dendrogram <- agnes(C, diss = TRUE, method = "complete")
plot(Dendrogram, which.plots = 2) # plot the dendrogram

# cut the tree into 3 clusters; cutree() is documented for hclust trees, so
# the agnes result is converted first
Cluster4 <- cutree(as.hclust(Dendrogram), k = 3)
plotcl(Formatted_Readings, Cluster4, main = "Hierarchical clustering results",
       ylab = "Consumption (kWh)", lim = c(0, 1.3))

```