BIA_T06_CS_newsletter_responses.Rmd 8.41 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
---
title:  'Tutorial 6: Newsletter Case'
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.


```{r Load libraries}
library(dplyr)
library(lubridate)
```



```{r read the files with basic R functions}
nl_mailsSend <- read.csv2("../data/newsletter/newsletterData_mailsSend.csv",
           encoding = "UTF-8", # this is usually not needed
           stringsAsFactors = F # R formats columns with text as factor, 
                                # this is not meaningful in our case
           ) 
nl_clicks <- read.csv2("../data/newsletter/newsletterData_clicks.csv",
           encoding = "UTF-8", stringsAsFactors = F)
nl_links <- read.csv2("../data/newsletter/newsletterData_links.csv",
           encoding = "UTF-8", stringsAsFactors = F)
nl_opens <- read.csv2("../data/newsletter/newsletterData_opens.csv",
           encoding = "UTF-8", stringsAsFactors = F)
```

```{r inspect the data}
summary(nl_mailsSend)
str(nl_mailsSend)

summary(nl_clicks)
str(nl_clicks)

summary(nl_links)
str(nl_links)

summary(nl_opens)
str(nl_opens)
```

```{r format the data correctly (exercise 3-5)}
# format the nl_mailsSend dataset

nl_mailsSend$VID <- as.factor(nl_mailsSend$VID)
nl_mailsSend$EmailID <- as.factor(nl_mailsSend$EmailID)
nl_mailsSend$NumTipIDs <- as.factor(nl_mailsSend$NumTipIDs)
nl_mailsSend$NewsletterTitle <- as.factor(nl_mailsSend$NewsletterTitle)
nl_mailsSend$EnergyReport.EfficiencyLevel <- as.factor(nl_mailsSend$EnergyReport.EfficiencyLevel)
nl_mailsSend$EnergyReport.HouseholdType <- as.factor(nl_mailsSend$EnergyReport.HouseholdType)
nl_mailsSend$EnergyReport.HouseholdMembers <- as.factor(nl_mailsSend$EnergyReport.HouseholdMembers)

# format the nl_clicks dataset

nl_clicks$LinkID <- as.factor(nl_clicks$LinkID)

# format the nl_links dataset

nl_links$EmailID <- as.factor(nl_links$EmailID)
nl_links$LinkID <- as.factor(nl_links$LinkID)

# format the nl_opens dataset

nl_opens$EmailID <- as.factor(nl_opens$EmailID)
```


```{r format the date columns (exercise 6-7)}

# format the date and time values with lubridate functions
nl_mailsSend$SendDate <- ymd_hms(nl_mailsSend$SendDate)
nl_mailsSend$EnergyReport.PeriodStart <- ymd_hms(nl_mailsSend$EnergyReport.PeriodStart)
nl_mailsSend$EnergyReport.PeriodEnd <- ymd_hms(nl_mailsSend$EnergyReport.PeriodEnd)

nl_clicks$ClickDate <- ymd_hms(nl_clicks$ClickDate)

nl_opens$OpenDate <- ymd_hms(nl_opens$OpenDate)


# optional: format the date and time values with basic R functions

# nl_mailsSend$SendDate <- strptime(nl_mailsSend$SendDate, format="%F %T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodStart <- strptime(nl_mailsSend$EnergyReport.PeriodStart, format="%FT%T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodEnd <- strptime(nl_mailsSend$EnergyReport.PeriodEnd, format="%FT%T", tz="UTC")
# 
# nl_opens$OpenDate <- strptime(nl_opens$OpenDate, format="%F %T", tz="UTC")
# 
# nl_clicks$ClickDate <- strptime(nl_clicks$ClickDate, format="%F %T", tz="UTC")
```


```{r statistics on time (exercise 8+9)}

# exercise 8
min(nl_mailsSend$SendDate)
max(nl_mailsSend$SendDate)

# exercise 9
table(as_date(nl_mailsSend$SendDate))
```

```{r dplyr expercises}
# exercises 10-13
filter(nl_mailsSend, VID == "1467")
filter(nl_mailsSend, as_date(SendDate) == ymd(20170404)) 
select(nl_mailsSend, VID, EmailID, SendDate)
arrange(nl_mailsSend, desc(SendDate))

# exercise 14
X_grouped <- mutate(nl_mailsSend, day_send = as_date(SendDate))
X_grouped <- group_by(X_grouped, day_send)
summarise(X_grouped, n_mails = n())

# exercise 15
X_grouped2 <- group_by(nl_mailsSend, NewsletterTitle)
summarise(X_grouped2, n_mails = n(), avg_time = mean(SendDate))

```

```{r dplyr exercises with piping}
# exercise 16 (new version of 14)
nl_mailsSend %>% 
  mutate(day_send = as_date(SendDate)) %>%
  group_by(day_send) %>%
  summarise(n_mails = n())

# exercise 16 (new version of 15)
nl_mailsSend %>%
  group_by(NewsletterTitle) %>%
  summarise(n_mails = n(), 
            avg_time = mean(SendDate))

# exercise 17
nl_mailsSend %>% 
  mutate(day_send = as_date(SendDate)) %>%
  filter(day_send==ymd(20170404)) %>%
  select(VID, EmailID, SendDate)
```

```{r exericses using joins}
email_opened <- nl_mailsSend %>%
  left_join(nl_opens, by="EmailID") %>% # we also need the rows with no match in the opens table
  group_by(EmailID) %>%
  summarise(opened = any(!is.na(OpenDate))) 
mean(email_opened$opened)

nl_mailsSend %>%
  left_join(nl_opens, by="EmailID") %>% # we also need the rows with no match in the opens table
  group_by(EmailID) %>%
  summarise(title = first(NewsletterTitle), 
            opened = any(!is.na(OpenDate))) %>%
  group_by(title) %>%
  summarise(openrate = mean(opened))


nl_mailsSend %>%
  left_join(nl_links, by="EmailID") %>%
  left_join(nl_clicks, by="LinkID") %>%
  group_by(EmailID) %>%
  #this is pretty much the same as above
  summarise(title = first(NewsletterTitle), 
            clicked = any(!is.na(ClickDate))) %>%
  group_by(title) %>%
  summarise(clickrate = mean(clicked))

```

```{r visualization}
# exercise 23
174
plot(nl_mailsSend$EnergyReport.Cons)
175
176

#exercise 24
177
178
plot(nl_mailsSend$EnergyReport.Cons, nl_mailsSend$EnergyReport.PrevCons)
plot(nl_mailsSend$EnergyReport.Cons ~ nl_mailsSend$EnergyReport.PrevCons)
179
180

#exercise 25
181
182
plot(nl_mailsSend$EnergyReport.Cons, log="y")
plot(nl_mailsSend$EnergyReport.Cons, nl_mailsSend$EnergyReport.PrevCons, log="xy")
183
184

# exercise 26
185
boxplot(nl_mailsSend$EnergyReport.Cons)
186
187

# exercise 27
188
hist(nl_mailsSend$EnergyReport.Cons, breaks = 30)
189
190
191
```
```{r histogram with additional lines - exercise 28 + 29}
# exercise 27-29
192
193
194
195
196
197
hist(nl_mailsSend$EnergyReport.Cons, 
     probability = T, # probability must be used to have the same scale as density
     breaks=80, # increases the number of bars of the histogram
     ylim = c(0,0.001)) 
lines(density(nl_mailsSend$EnergyReport.Cons), col=2)
abline(v=quantile(nl_mailsSend$EnergyReport.Cons, probs = c(0.25,0.5,0.75)), col=3)
198
199
200
201
```


```{r bar and pie charts with colors (exercise 30 extended)}
202
203
204
205
206
207
208
209
210
211
212
213
214
library(RColorBrewer)
mycolors <- brewer.pal(4, "Dark2")

barplot(table(as_date(nl_mailsSend$SendDate)), # barplot needs a frequency table
        main="Number of emails sent per newsletter", # the plot title
        horiz = T, # horizontal barplot
        las=1, # aligns the axis labels to the reading direction
        cex.names = 0.6, # adjust size of the labels
        col=mycolors[c(1,1,1,1,2,3,4)]) # set the colors

pie(table(as_date(nl_mailsSend$SendDate)),
    col=mycolors[c(1,1,1,1,2,3,4)], 
    main="Number of emails sent per newsletter")
215
216
217
218
219
220
```


```{r plot showing email opens and clicks over time}

# identify all opens per day
221
222
223
224
actions_open <- nl_opens %>%
  mutate(day_action = as_date(OpenDate)) %>% 
  group_by(day_action) %>%
  summarise(num_opens = n())
225
226

# identify all clicks per day
227
228
229
230
actions_click <- nl_clicks %>%
  mutate(day_action = as_date(ClickDate)) %>% 
  group_by(day_action) %>%
  summarise(num_clicks = n())
231
232

# combine clicks and opens to one data frame
233
actions_all <- full_join(actions_open, actions_click, by="day_action")  
234
235

# replace NA values with 0
236
237
actions_all$num_opens <- ifelse(is.na(actions_all$num_opens), 0, actions_all$num_opens)
actions_all$num_clicks <- ifelse(is.na(actions_all$num_clicks), 0, actions_all$num_clicks)
238
239

# a simple plot will show a misleading picture
240
plot(actions_all$num_opens ~actions_all$day_action, type="b")
241
242

# create rows for all days with no actions
243
244
245
246
247
248
days_noaction <- data.frame(
  day_action = seq(from = ymd(20170404), to = ymd(20171010), by = "days"),
  num_clicks = 0,
  num_opens = 0) %>% anti_join(actions_all, by = "day_action")

actions_all <- rbind(actions_all, days_noaction) %>% arrange(day_action)
249
250

# find the points in time when the newsletter were send
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
senddates_newsletter <- nl_mailsSend %>% 
  mutate(nl_uniquename = paste(NewsletterTitle, EnergyReport.Quarter)) %>%
  group_by(nl_uniquename) %>% summarise(date_send = mean(SendDate))

plot(actions_all$num_opens ~ actions_all$day_action, type="l",
     ylim=c(0,250), xlab="Time", ylab="Number of email opens / clicks")
abline(h=seq(from=0, to=250, by=50), lty="dashed", col="gray", lwd=2)
lines(actions_all$num_clicks ~ actions_all$day_action, col=2)

abline(v=as_date(senddates_newsletter$date_send), col=4, lty=3)

legend("topleft", legend = c("Time of email energy report",
                             "Number of email opens",
                             "Number of clicks"), col=c(4,1,2), 
       lty=c(2,1,1), bg = "White")
266
267
268
```