Commit 1d9f74be authored by Hopf, Konstantin

Merge branch 'master' of gitlab.rz.uni-bamberg.de:eesys-public/2018-ws-bia

parents 158ad496 70b6cf85
@@ -189,7 +189,7 @@ APC$Filter_IsConsumptionOutlier2 <- (is.outlier_sigma(APC$NCons_2011, sigma = 2)
```
When using the IQR-method, `r prop.table(table(APC$Filter_IsConsumptionOutlier))[2]*100` % of the values are identified as outliers.
When using the Sigma-method, `r prop.table(table(APC$Filter_IsConsumptionOutlier2))[2]*100` % of the values are identified as outliers.
After identifying outliers, you need to decide how to deal with them. Typically, you can trim, winsorize, or label outliers. Ultimately, the best strategy depends on the business question; the sketch below illustrates all three options.
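A minimal sketch of the three strategies, assuming the `APC` data frame, the consumption column `NCons_2011`, and the `Filter_IsConsumptionOutlier` flag created above. The 1%/99% winsorizing bounds and the column name `NCons_2011_wins` are only illustrative choices, not part of the original script.
```{r outlier handling strategies (illustrative sketch)}
# Trimming: drop the flagged observations entirely
APC_trimmed <- APC[!APC$Filter_IsConsumptionOutlier, ]

# Winsorizing: cap extreme values at illustrative percentile bounds (assumption: 1%/99%)
bounds <- quantile(APC$NCons_2011, probs = c(0.01, 0.99), na.rm = TRUE)
APC$NCons_2011_wins <- pmin(pmax(APC$NCons_2011, bounds[1]), bounds[2])

# Labeling: keep the raw values and carry the outlier flag into later analyses
table(APC$Filter_IsConsumptionOutlier)
```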
...
---
title: 'Tutorial 6: Newsletter Case'
output: html_notebook
editor_options:
chunk_output_type: inline
---
This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.
```{r Load libraries}
library(dplyr)
library(lubridate)
```
```{r read the files with basic R functions}
nl_mailsSend <- read.csv2("../data/newsletter/newsletterData_mailsSend.csv",
                          encoding = "UTF-8",  # this is usually not needed
                          stringsAsFactors = F # R formats columns with text as factor,
                                               # this is not meaningful in our case
                          )
nl_clicks <- read.csv2("../data/newsletter/newsletterData_clicks.csv",
                       encoding = "UTF-8", stringsAsFactors = F)
nl_links <- read.csv2("../data/newsletter/newsletterData_links.csv",
                      encoding = "UTF-8", stringsAsFactors = F)
nl_opens <- read.csv2("../data/newsletter/newsletterData_opens.csv",
                      encoding = "UTF-8", stringsAsFactors = F)
```
```{r inspect the data}
summary(nl_mailsSend)
str(nl_mailsSend)
summary(nl_clicks)
str(nl_clicks)
summary(nl_links)
str(nl_links)
summary(nl_opens)
str(nl_opens)
```
```{r format the data correctly (exercise 3-5)}
# format the nl_mailsSend dataset
nl_mailsSend$VID <- as.factor(nl_mailsSend$VID)
nl_mailsSend$EmailID <- as.factor(nl_mailsSend$EmailID)
nl_mailsSend$NumTipIDs <- as.factor(nl_mailsSend$NumTipIDs)
nl_mailsSend$NewsletterTitle <- as.factor(nl_mailsSend$NewsletterTitle)
nl_mailsSend$EnergyReport.EfficiencyLevel <- as.factor(nl_mailsSend$EnergyReport.EfficiencyLevel)
nl_mailsSend$EnergyReport.HouseholdType <- as.factor(nl_mailsSend$EnergyReport.HouseholdType)
nl_mailsSend$EnergyReport.HouseholdMembers <- as.factor(nl_mailsSend$EnergyReport.HouseholdMembers)
# format the nl_clicks dataset
nl_clicks$LinkID <- as.factor(nl_clicks$LinkID)
# format the nl_links dataset
nl_links$EmailID <- as.factor(nl_links$EmailID)
nl_links$LinkID <- as.factor(nl_links$LinkID)
# format the nl_opens dataset
nl_opens$EmailID <- as.factor(nl_opens$EmailID)
```
```{r format the date columns (exercise 6-7)}
# format the date and time values with lubridate functions
nl_mailsSend$SendDate <- ymd_hms(nl_mailsSend$SendDate)
nl_mailsSend$EnergyReport.PeriodStart <- ymd_hms(nl_mailsSend$EnergyReport.PeriodStart)
nl_mailsSend$EnergyReport.PeriodEnd <- ymd_hms(nl_mailsSend$EnergyReport.PeriodEnd)
nl_clicks$ClickDate <- ymd_hms(nl_clicks$ClickDate)
nl_opens$OpenDate <- ymd_hms(nl_opens$OpenDate)
# optional: format the date and time values with basic R functions
# nl_mailsSend$SendDate <- strptime(nl_mailsSend$SendDate, format="%F %T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodStart <- strptime(nl_mailsSend$EnergyReport.PeriodStart, format="%FT%T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodEnd <- strptime(nl_mailsSend$EnergyReport.PeriodEnd, format="%FT%T", tz="UTC")
#
# nl_opens$OpenDate <- strptime(nl_opens$OpenDate, format="%F %T", tz="UTC")
#
# nl_clicks$ClickDate <- strptime(nl_clicks$ClickDate, format="%F %T", tz="UTC")
```
```{r statistics on time (exercise 8+9)}
# exercise 8
min(nl_mailsSend$SendDate)
max(nl_mailsSend$SendDate)
# exercise 9
table(as_date(nl_mailsSend$SendDate))
```
```{r dplyr exercises}
# exercises 10-13
filter(nl_mailsSend, VID == "1467")
filter(nl_mailsSend, as_date(SendDate) == ymd(20170404))
select(nl_mailsSend, VID, EmailID, SendDate)
arrange(nl_mailsSend, desc(SendDate))
# exercise 14
X_grouped <- mutate(nl_mailsSend, day_send = as_date(SendDate))
X_grouped <- group_by(X_grouped, day_send)
summarise(X_grouped, n_mails = n())
# exercise 15
X_grouped2 <- group_by(nl_mailsSend, NewsletterTitle)
summarise(X_grouped2, n_mails = n(), avg_time = mean(SendDate))
```
```{r dplyr exercises with piping}
# exercise 16 (new version of 14)
nl_mailsSend %>%
  mutate(day_send = as_date(SendDate)) %>%
  group_by(day_send) %>%
  summarise(n_mails = n())
# exercise 16 (new version of 15)
nl_mailsSend %>%
  group_by(NewsletterTitle) %>%
  summarise(n_mails = n(),
            avg_time = mean(SendDate))
# exercise 17
nl_mailsSend %>%
  mutate(day_send = as_date(SendDate)) %>%
  filter(day_send == ymd(20170404)) %>%
  select(VID, EmailID, SendDate)
```
```{r exercises using joins}
email_opened <- nl_mailsSend %>%
  left_join(nl_opens, by = "EmailID") %>% # we also need the rows with no match in the opens table
  group_by(EmailID) %>%
  summarise(opened = any(!is.na(OpenDate)))
mean(email_opened$opened)

nl_mailsSend %>%
  left_join(nl_opens, by = "EmailID") %>% # we also need the rows with no match in the opens table
  group_by(EmailID) %>%
  summarise(title = first(NewsletterTitle),
            opened = any(!is.na(OpenDate))) %>%
  group_by(title) %>%
  summarise(openrate = mean(opened))

nl_mailsSend %>%
  left_join(nl_links, by = "EmailID") %>%
  left_join(nl_clicks, by = "LinkID") %>%
  group_by(EmailID) %>%
  # this is pretty much the same as above
  summarise(title = first(NewsletterTitle),
            clicked = any(!is.na(ClickDate))) %>%
  group_by(title) %>%
  summarise(clickrate = mean(clicked))
```
```{r visualization}
# exercise 23
# exercise 24
# exercise 25
# exercise 26
# exercise 27
```
```{r histogram with additional lines - exercise 28 + 29}
# exercise 27-29
```
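The chunk above is left open as an exercise. One possible sketch is shown below; it assumes the task is a histogram of the number of opens per email with mean and median reference lines, which is only a guess at the intended plot.
```{r histogram sketch (illustrative, not the official solution)}
# opens recorded per email (only emails with at least one open appear here)
opens_per_mail <- as.numeric(table(nl_opens$EmailID))
hist(opens_per_mail, breaks = 20,
     main = "Opens per email", xlab = "Number of opens")
abline(v = mean(opens_per_mail), col = "red", lwd = 2)    # mean as additional line
abline(v = median(opens_per_mail), col = "blue", lwd = 2) # median as additional line
```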
```{r bar and pie charts with colors (exercise 30 extended)}
```
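Again a sketch rather than the official solution: assuming the exercise asks to visualize how many mails were sent per newsletter title, the same counts can be shown as a colored bar chart and a pie chart.
```{r bar and pie chart sketch (illustrative)}
title_counts <- table(nl_mailsSend$NewsletterTitle)
cols <- rainbow(length(title_counts))               # one color per newsletter title
barplot(title_counts, col = cols, las = 2, main = "Mails sent per newsletter")
pie(title_counts, col = cols, main = "Share of mails per newsletter")
```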
```{r plot showing email opens and clicks over time}
# identify all opens per day
# identify all clicks per day
# combine clicks and opens to one data frame
# replace NA values with 0
# a simple plot will show a misleading picture
# create rows for all days with no actions
# find the points in time when the newsletters were sent
```
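One way to implement the steps listed above (a sketch under the assumption that daily counts of opens and clicks are wanted; the object names `opens_per_day`, `clicks_per_day`, `actions_per_day`, and `all_days` are illustrative):
```{r opens and clicks over time (illustrative sketch)}
# count opens and clicks per day
opens_per_day <- nl_opens %>%
  mutate(day = as_date(OpenDate)) %>%
  group_by(day) %>%
  summarise(n_opens = n())
clicks_per_day <- nl_clicks %>%
  mutate(day = as_date(ClickDate)) %>%
  group_by(day) %>%
  summarise(n_clicks = n())

# combine both tables into one data frame
actions_per_day <- full_join(opens_per_day, clicks_per_day, by = "day")

# add rows for days without any recorded action, otherwise the plot is misleading
all_days <- data.frame(day = seq(min(actions_per_day$day), max(actions_per_day$day), by = "day"))
actions_per_day <- left_join(all_days, actions_per_day, by = "day")
# replace NA values with 0
actions_per_day[is.na(actions_per_day)] <- 0

# plot opens and clicks over time and mark the send dates of the newsletters
plot(actions_per_day$day, actions_per_day$n_opens, type = "l",
     ylim = c(0, max(actions_per_day$n_opens, actions_per_day$n_clicks)),
     xlab = "Day", ylab = "Count", main = "Email opens and clicks over time")
lines(actions_per_day$day, actions_per_day$n_clicks, col = 2)
abline(v = unique(as_date(nl_mailsSend$SendDate)), lty = 3, col = "grey")
legend("topright", c("Opens", "Clicks"), col = c(1, 2), lty = 1)
```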
---
title: "Clustering with smart meter data"
output: html_notebook
editor_options:
chunk_output_type: inline
---
```{r load and format data}
# load and prepare the data ----
SMD_Readings <- read.csv2("../data/SMD_BIA_Data.csv")
# we use the data for a single week
time_stamps <- as.numeric(as.character(unique(SMD_Readings$Timestamp)))
single_week <- as.character(time_stamps[time_stamps>25000 & time_stamps<25700])
length(single_week)
# select the measurements for one single week
SMD_Readings <- SMD_Readings[SMD_Readings$Timestamp %in% single_week, ]
# the reshape2 package helps us to transform the data into a more condensed table form
#install.packages("reshape2")
library("reshape2")
Formatted_Readings <- dcast(SMD_Readings, ID ~ Timestamp, value.var = "Consumption")
head(Formatted_Readings)
# remove the IDs from the matrix
IDs <- Formatted_Readings[,1]
Formatted_Readings[,1] <- NULL
```
```{r first k-Means clustering}
# simple k-means clustering ----
set.seed(1)
Cluster1 <- kmeans(Formatted_Readings, centers=3)
```
```{r function to plot the results}
# Creating cluster visualization ----
plotcl <- function(SMD_Readings, clusters, lim = c(0, 2), ...){
  plot(colMeans(SMD_Readings[, 1:48]), type = "n", ylim = lim, ...)
  for(i in unique(clusters)){
    lines(colMeans(SMD_Readings[clusters == i, 1:48]), col = i, lwd = 3)
  }
}
plotclWeek <- function(SMD_Readings, clusters, lim = c(0, 2), ...){
  plot(colMeans(SMD_Readings[, ]), type = "n", ylim = lim, ...)
  for(i in unique(clusters)){
    lines(colMeans(SMD_Readings[clusters == i, ]), col = i, lwd = 3)
  }
}
plotcl(Formatted_Readings, Cluster1$cluster, ylab="Consumption (kWh)")
plotclWeek(Formatted_Readings, Cluster1$cluster, ylab="Consumption (kWh)")
```
```{r second k-means clustering}
# Improve the clustering ----
#Normalize the values and then run the k-means again
set.seed(2)
Data_Norm <- Formatted_Readings/rowMeans(Formatted_Readings)
Cluster2 <- kmeans(Data_Norm, centers=3)
plotcl(Formatted_Readings, Cluster2$cluster, ylab="Consumption (kWh)", main="Mean-normalized consumption", lim=c(0,1.2))
# Transform the values to a more normally distributed form and run the k-means again
Datas <- sqrt(Formatted_Readings)
Datasn <- Datas/rowMeans(Datas)
set.seed(4)
Cluster3 <- kmeans(Datasn, centers=3)
plotcl(Formatted_Readings, Cluster3$cluster, ylab="Consumption (kWh)", main="Sqrt and mean-normalized consumption", lim=c(0,1.2))
```
```{r obtain the optimal number of clusters}
set.seed(2)
Clusters <- list()
Clusters[[1]] <- kmeans(Datasn, centers=2)
Clusters[[2]] <- kmeans(Datasn, centers=3)
Clusters[[3]] <- kmeans(Datasn, centers=4)
Clusters[[4]] <- kmeans(Datasn, centers=5)
Clusters[[5]] <- kmeans(Datasn, centers=6)
Clusters[[6]] <- kmeans(Datasn, centers=7)
Clusters[[7]] <- kmeans(Datasn, centers=8)
Clusters[[8]] <- kmeans(Datasn, centers=9)
# the total within-cluster sum of squares
tot.withinss <- sapply(Clusters, function(v){return(v$tot.withinss)})
plot(2:9, tot.withinss, xlab="Num. of clusters", ylab="Total within-cluster sum of squares", type="b")
#the min / max sum of squares in the clusters
min.withinss <- sapply(Clusters, function(v){return(min(v$withinss))})
max.withinss <- sapply(Clusters, function(v){return(max(v$withinss))})
plot(2:9, max.withinss, xlab="Num. of clusters",
     ylab="Within-cluster sum of squares", type="b", ylim=c(min(min.withinss), max(max.withinss)))
lines(2:9, min.withinss, type="b", col=2)
legend("topright", c("Max. WSS", "Min. WSS"), col=c(1,2), lty=1)
numOneElemClusters <- sapply(Clusters, function(v){return(sum(v$size==1))})
barplot(numOneElemClusters, names.arg = 2:9, main="Single-element clusters",
        xlab="Total number of k-Means clusters")
```
```{r hierarchical clustering}
library(cluster)
#create distance matrix
C <- 1-cor(t(Formatted_Readings))
Dendrogram <- agnes(C,diss=T,method="complete")
plot(Dendrogram, which.plot=2) #plot the dendrogram
Cluster4 <- cutree(Dendrogram, k=3)
plotcl(Formatted_Readings, Cluster4, main="Hierarchical clustering results", ylab="Consumption (kWh)", lim=c(0,1.3))
```