BIA_T06_CS_newsletter_responses.Rmd 5.61 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
---
title:  'Tutorial 6: Newsletter Case'
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.


```{r Load libraries}
library(dplyr)
library(lubridate)
```



```{r read the files with basic R functions}
nl_mailsSend <- read.csv2("../data/newsletter/newsletterData_mailsSend.csv",
           encoding = "UTF-8", # this is usually not needed
           stringsAsFactors = F # R formats columns with text as factor, 
                                # this is not meaningful in our case
           ) 
nl_clicks <- read.csv2("../data/newsletter/newsletterData_clicks.csv",
           encoding = "UTF-8", stringsAsFactors = F)
nl_links <- read.csv2("../data/newsletter/newsletterData_links.csv",
           encoding = "UTF-8", stringsAsFactors = F)
nl_opens <- read.csv2("../data/newsletter/newsletterData_opens.csv",
           encoding = "UTF-8", stringsAsFactors = F)
```

```{r inspect the data}
summary(nl_mailsSend)
str(nl_mailsSend)

summary(nl_clicks)
str(nl_clicks)

summary(nl_links)
str(nl_links)

summary(nl_opens)
str(nl_opens)
```

```{r format the data correctly (exercise 3-5)}
# format the nl_mailsSend dataset

nl_mailsSend$VID <- as.factor(nl_mailsSend$VID)
nl_mailsSend$EmailID <- as.factor(nl_mailsSend$EmailID)
nl_mailsSend$NumTipIDs <- as.factor(nl_mailsSend$NumTipIDs)
nl_mailsSend$NewsletterTitle <- as.factor(nl_mailsSend$NewsletterTitle)
nl_mailsSend$EnergyReport.EfficiencyLevel <- as.factor(nl_mailsSend$EnergyReport.EfficiencyLevel)
nl_mailsSend$EnergyReport.HouseholdType <- as.factor(nl_mailsSend$EnergyReport.HouseholdType)
nl_mailsSend$EnergyReport.HouseholdMembers <- as.factor(nl_mailsSend$EnergyReport.HouseholdMembers)

# format the nl_clicks dataset

nl_clicks$LinkID <- as.factor(nl_clicks$LinkID)

# format the nl_links dataset

nl_links$EmailID <- as.factor(nl_links$EmailID)
nl_links$LinkID <- as.factor(nl_links$LinkID)

# format the nl_opens dataset

nl_opens$EmailID <- as.factor(nl_opens$EmailID)
```


```{r format the date columns (exercise 6-7)}

# format the date and time values with lubridate functions
nl_mailsSend$SendDate <- ymd_hms(nl_mailsSend$SendDate)
nl_mailsSend$EnergyReport.PeriodStart <- ymd_hms(nl_mailsSend$EnergyReport.PeriodStart)
nl_mailsSend$EnergyReport.PeriodEnd <- ymd_hms(nl_mailsSend$EnergyReport.PeriodEnd)

nl_clicks$ClickDate <- ymd_hms(nl_clicks$ClickDate)

nl_opens$OpenDate <- ymd_hms(nl_opens$OpenDate)


# optional: format the date and time values with basic R functions

# nl_mailsSend$SendDate <- strptime(nl_mailsSend$SendDate, format="%F %T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodStart <- strptime(nl_mailsSend$EnergyReport.PeriodStart, format="%FT%T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodEnd <- strptime(nl_mailsSend$EnergyReport.PeriodEnd, format="%FT%T", tz="UTC")
# 
# nl_opens$OpenDate <- strptime(nl_opens$OpenDate, format="%F %T", tz="UTC")
# 
# nl_clicks$ClickDate <- strptime(nl_clicks$ClickDate, format="%F %T", tz="UTC")
```


```{r statistics on time (exercise 8+9)}

# exercise 8
min(nl_mailsSend$SendDate)
max(nl_mailsSend$SendDate)

# exercise 9
table(as_date(nl_mailsSend$SendDate))
```

```{r dplyr expercises}
# exercises 10-13
filter(nl_mailsSend, VID == "1467")
filter(nl_mailsSend, as_date(SendDate) == ymd(20170404)) 
select(nl_mailsSend, VID, EmailID, SendDate)
arrange(nl_mailsSend, desc(SendDate))

# exercise 14
X_grouped <- mutate(nl_mailsSend, day_send = as_date(SendDate))
X_grouped <- group_by(X_grouped, day_send)
summarise(X_grouped, n_mails = n())

# exercise 15
X_grouped2 <- group_by(nl_mailsSend, NewsletterTitle)
summarise(X_grouped2, n_mails = n(), avg_time = mean(SendDate))

```

```{r dplyr exercises with piping}
# exercise 16 (new version of 14)
nl_mailsSend %>% 
  mutate(day_send = as_date(SendDate)) %>%
  group_by(day_send) %>%
  summarise(n_mails = n())

# exercise 16 (new version of 15)
nl_mailsSend %>%
  group_by(NewsletterTitle) %>%
  summarise(n_mails = n(), 
            avg_time = mean(SendDate))

# exercise 17
nl_mailsSend %>% 
  mutate(day_send = as_date(SendDate)) %>%
  filter(day_send==ymd(20170404)) %>%
  select(VID, EmailID, SendDate)
```

```{r exericses using joins}
email_opened <- nl_mailsSend %>%
  left_join(nl_opens, by="EmailID") %>% # we also need the rows with no match in the opens table
  group_by(EmailID) %>%
  summarise(opened = any(!is.na(OpenDate))) 
mean(email_opened$opened)

nl_mailsSend %>%
  left_join(nl_opens, by="EmailID") %>% # we also need the rows with no match in the opens table
  group_by(EmailID) %>%
  summarise(title = first(NewsletterTitle), 
            opened = any(!is.na(OpenDate))) %>%
  group_by(title) %>%
  summarise(openrate = mean(opened))


nl_mailsSend %>%
  left_join(nl_links, by="EmailID") %>%
  left_join(nl_clicks, by="LinkID") %>%
  group_by(EmailID) %>%
  #this is pretty much the same as above
  summarise(title = first(NewsletterTitle), 
            clicked = any(!is.na(ClickDate))) %>%
  group_by(title) %>%
  summarise(clickrate = mean(clicked))

```

```{r visualization}
# exercise 23

#exercise 24

#exercise 25

# exercise 26

# exercise 27

```
```{r histogram with additional lines - exercise 28 + 29}
# exercise 27-29

```



```{r bar and pie charts with colors (exercise 30 extended)}

```


```{r plot showing email opens and clicks over time}

# identify all opens per day

# identify all clicks per day

# combine clicks and opens to one data frame


# replace NA values with 0

# a simple plot will show a misleading picture


# create rows for all days with no actions

# find the points in time when the newsletter were send
```