Commit 5ebe6b14 authored by Weigert, Andreas's avatar Weigert, Andreas
Browse files

added solution for part 1 (data analytics)

parent 84e40704
---
title: 'Tutorial 6: Newsletter Case'
output: html_notebook
editor_options:
chunk_output_type: inline
---
This file is part of the lecture Business Intelligence & Analytics (EESYS-BIA-M), Information Systems and Energy Efficient Systems, University of Bamberg.
```{r Load libraries}
library(dplyr)
library(lubridate)
```
```{r read the files with basic R functions}
nl_mailsSend <- read.csv2("../data/newsletter/newsletterData_mailsSend.csv",
encoding = "UTF-8", # this is usually not needed
stringsAsFactors = F # R formats columns with text as factor,
# this is not meaningful in our case
)
nl_clicks <- read.csv2("../data/newsletter/newsletterData_clicks.csv",
encoding = "UTF-8", stringsAsFactors = F)
nl_links <- read.csv2("../data/newsletter/newsletterData_links.csv",
encoding = "UTF-8", stringsAsFactors = F)
nl_opens <- read.csv2("../data/newsletter/newsletterData_opens.csv",
encoding = "UTF-8", stringsAsFactors = F)
```
```{r inspect the data}
summary(nl_mailsSend)
str(nl_mailsSend)
summary(nl_clicks)
str(nl_clicks)
summary(nl_links)
str(nl_links)
summary(nl_opens)
str(nl_opens)
```
```{r format the data correctly (exercise 3-5)}
# format the nl_mailsSend dataset
nl_mailsSend$VID <- as.factor(nl_mailsSend$VID)
nl_mailsSend$EmailID <- as.factor(nl_mailsSend$EmailID)
nl_mailsSend$NumTipIDs <- as.factor(nl_mailsSend$NumTipIDs)
nl_mailsSend$NewsletterTitle <- as.factor(nl_mailsSend$NewsletterTitle)
nl_mailsSend$EnergyReport.EfficiencyLevel <- as.factor(nl_mailsSend$EnergyReport.EfficiencyLevel)
nl_mailsSend$EnergyReport.HouseholdType <- as.factor(nl_mailsSend$EnergyReport.HouseholdType)
nl_mailsSend$EnergyReport.HouseholdMembers <- as.factor(nl_mailsSend$EnergyReport.HouseholdMembers)
# format the nl_clicks dataset
nl_clicks$LinkID <- as.factor(nl_clicks$LinkID)
# format the nl_links dataset
nl_links$EmailID <- as.factor(nl_links$EmailID)
nl_links$LinkID <- as.factor(nl_links$LinkID)
# format the nl_opens dataset
nl_opens$EmailID <- as.factor(nl_opens$EmailID)
```
```{r format the date columns (exercise 6-7)}
# format the date and time values with lubridate functions
nl_mailsSend$SendDate <- ymd_hms(nl_mailsSend$SendDate)
nl_mailsSend$EnergyReport.PeriodStart <- ymd_hms(nl_mailsSend$EnergyReport.PeriodStart)
nl_mailsSend$EnergyReport.PeriodEnd <- ymd_hms(nl_mailsSend$EnergyReport.PeriodEnd)
nl_clicks$ClickDate <- ymd_hms(nl_clicks$ClickDate)
nl_opens$OpenDate <- ymd_hms(nl_opens$OpenDate)
# optional: format the date and time values with basic R functions
# nl_mailsSend$SendDate <- strptime(nl_mailsSend$SendDate, format="%F %T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodStart <- strptime(nl_mailsSend$EnergyReport.PeriodStart, format="%FT%T", tz="UTC")
# nl_mailsSend$EnergyReport.PeriodEnd <- strptime(nl_mailsSend$EnergyReport.PeriodEnd, format="%FT%T", tz="UTC")
#
# nl_opens$OpenDate <- strptime(nl_opens$OpenDate, format="%F %T", tz="UTC")
#
# nl_clicks$ClickDate <- strptime(nl_clicks$ClickDate, format="%F %T", tz="UTC")
```
```{r statistics on time (exercise 8+9)}
# exercise 8
min(nl_mailsSend$SendDate)
max(nl_mailsSend$SendDate)
# exercise 9
table(as_date(nl_mailsSend$SendDate))
```
```{r dplyr expercises}
# exercises 10-13
filter(nl_mailsSend, VID == "1467")
filter(nl_mailsSend, as_date(SendDate) == ymd(20170404))
select(nl_mailsSend, VID, EmailID, SendDate)
arrange(nl_mailsSend, desc(SendDate))
# exercise 14
X_grouped <- mutate(nl_mailsSend, day_send = as_date(SendDate))
X_grouped <- group_by(X_grouped, day_send)
summarise(X_grouped, n_mails = n())
# exercise 15
X_grouped2 <- group_by(nl_mailsSend, NewsletterTitle)
summarise(X_grouped2, n_mails = n(), avg_time = mean(SendDate))
```
```{r dplyr exercises with piping}
# exercise 16 (new version of 14)
nl_mailsSend %>%
mutate(day_send = as_date(SendDate)) %>%
group_by(day_send) %>%
summarise(n_mails = n())
# exercise 16 (new version of 15)
nl_mailsSend %>%
group_by(NewsletterTitle) %>%
summarise(n_mails = n(),
avg_time = mean(SendDate))
# exercise 17
nl_mailsSend %>%
mutate(day_send = as_date(SendDate)) %>%
filter(day_send==ymd(20170404)) %>%
select(VID, EmailID, SendDate)
```
```{r exericses using joins}
email_opened <- nl_mailsSend %>%
left_join(nl_opens, by="EmailID") %>% # we also need the rows with no match in the opens table
group_by(EmailID) %>%
summarise(opened = any(!is.na(OpenDate)))
mean(email_opened$opened)
nl_mailsSend %>%
left_join(nl_opens, by="EmailID") %>% # we also need the rows with no match in the opens table
group_by(EmailID) %>%
summarise(title = first(NewsletterTitle),
opened = any(!is.na(OpenDate))) %>%
group_by(title) %>%
summarise(openrate = mean(opened))
nl_mailsSend %>%
left_join(nl_links, by="EmailID") %>%
left_join(nl_clicks, by="LinkID") %>%
group_by(EmailID) %>%
#this is pretty much the same as above
summarise(title = first(NewsletterTitle),
clicked = any(!is.na(ClickDate))) %>%
group_by(title) %>%
summarise(clickrate = mean(clicked))
```
```{r visualization}
# exercise 23
#exercise 24
#exercise 25
# exercise 26
# exercise 27
```
```{r histogram with additional lines - exercise 28 + 29}
# exercise 27-29
```
```{r bar and pie charts with colors (exercise 30 extended)}
```
```{r plot showing email opens and clicks over time}
# identify all opens per day
# identify all clicks per day
# combine clicks and opens to one data frame
# replace NA values with 0
# a simple plot will show a misleading picture
# create rows for all days with no actions
# find the points in time when the newsletter were send
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment