#################################################################
# Code for: Quantitative User Experience Research
# Chapter 9 -- Log Sequence Visualization
#
# Authors: Chris Chapman & Kerry Rodden
#
# Copyright (c) 2023, Chris Chapman & Kerry Rodden
#
# Last update: March 3, 2023
# Version: 1.0
#
# Licensed under the MIT License (the "License");
# you may not use this file except in compliance with the License.
#
# You may obtain a complete copy of the License in the accompanying file
# "MIT-license.txt" or at: https://opensource.org/licenses/MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#################################################################
# BRIEF HOW TO USE
#
# This file contains scripts used in Chapter 9 of Chapman & Rodden (2023),
# "Quantitative User Experience Research", Apress.
#
# We recommend that readers type the code from the book, as that
# accelerates learning. However, this file may be used instead;
# just step through it section by section to match the book.
#
# Note that there may be code sections and comments here that
# do not appear in the book, with supplementary information.
#################################################################


# sunburst example

# create example sequence data
set.seed(10010)      # make the data repeatable
foods      <- c("Pastry", "Granola", "Yogurt", "Potatoes", "Eggs")
N.obs      <- 5000
num.events <- sample(4, N.obs, replace = TRUE)   # 1 to 4 events per observation
table(num.events)

# build one random event sequence of length "len", drawn from "dat"
one.event <- function(len, dat, prob=((length(dat)+2):3),
                      replace=TRUE, sep="-") {
  event <- sample(dat, len, prob=prob, replace = replace)
  paste0(event, collapse=sep)
}

one.event(3, foods)
one.event(6, foods)

# generate a sequence for every observation
events <- sapply(num.events, one.event, dat=foods)
str(events)

library(car)
some(events)

# tabulate the frequency of each unique sequence
events.freq <- data.frame(table(events))
head(events.freq)

library(sunburstR)
library(RColorBrewer)    # install if necessary
breakfastPalette <- brewer.pal(5, "Set1")
# breakfastPalette <- c("#E41A1C","#377EB8","#4DAF4A","#984EA3","#FF7F00")

sunburst(events.freq, colors = breakfastPalette)


### load data from R-book sequences ###
### in this section, we get the web log, sessionize it, and filter for
### HTML pages only
###
### For reference and more details, see R for Marketing Research and Analytics,
### 2nd ed., Chapter 14 (Chapman & Feit, 2019)
###

# get basic data
# this file starts after basic data processing noted in Chapman & Feit, Ch. 14
epa.df <- readRDS(gzcon(url("https://goo.gl/s5vjWz")))
head(epa.df)
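
# (Supplementary comment; not in the book.) The sessionization steps below
# flag each row that starts a new session and then apply cumsum() to that
# logical flag to assign a session number to every row. A minimal sketch of
# the idea, using a small made-up flag vector (hypothetical values only):
demo.newsession <- c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE)
cumsum(demo.newsession)   # ==> 1 1 1 2 2 3, i.e., a session number per row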
# extract sessions
# 1. put DF in order of host and timestamp
epa.ordered <- epa.df[order(epa.df$host, epa.df$datetime), ]

# 2. get time differences between rows in minutes
epa.ordered$time.diff <- c(NA,
                           as.numeric(
                             epa.ordered$datetime[2:nrow(epa.ordered)] -
                             epa.ordered$datetime[1:(nrow(epa.ordered)-1)],
                             units="mins"))

# 3. determine new sessions, as being either:
# .. 1: host has changed since previous row
# .. 2: time difference exceeds session cutoff time of 15 minutes
session.time              <- 15    # exceed (mins) ==> new session
epa.ordered$newsession    <- NA    # is this row a new session?
epa.ordered$newsession[1] <- TRUE  # row 1 is always a new session

epa.ordered$newsession[2:nrow(epa.ordered)] <- ifelse(
  epa.ordered$host[2:nrow(epa.ordered)] !=
  epa.ordered$host[1:(nrow(epa.ordered)-1)],            # hosts differ
  TRUE,                                                 # ==> new session
  epa.ordered$time.diff[2:nrow(epa.ordered)] >=
  session.time)                                         # else new if time exceeded

# 4. finalize session numbers & initial time differences
epa.ordered$session <- cumsum(epa.ordered$newsession)
epa.ordered$time.diff[epa.ordered$newsession] <- NA     # time NA for new session

# 5. remove everything except HTML pages
epa.html <- epa.ordered[epa.ordered$pagetype=="html", ]

# 6. check a few rows to make sure the steps worked
epa.html[1:5, c(1, 13, 10)]


### sunburst processing ###
### In this section, we combine the session pages into ordered sequences
### and count the occurrences of each sequence

# first, change dashes in the page names to underscores (_),
# because "-" separates sequence steps in the sunburst data
epa.html$page <- gsub("-", "_", epa.html$page)

# now split epa.html$page into a list with one vector of pages per session
epa.chunks <- split(epa.html$page, epa.html$session)
head(epa.chunks)

# assemble those pages into 1 sequence string for each chunk
# set maximum length to be 5 pages for a tidier sunburst
epa.sequences <- data.frame(sequence=sapply(
  epa.chunks,
  function(x) paste0(x[1:min(length(x), 5)], collapse="-")))

epa.sequences[1:2, ]

# count the occurrences of each sequence
epa.sequences.freq <- data.frame(table(epa.sequences$sequence))
head(epa.sequences.freq, 3)

# how many sequences occur more than once?
table(epa.sequences.freq$Freq > 1)

### sunburst chart where frequency is greater than 1
library(sunburstR)
sunburst(data=subset(epa.sequences.freq, Freq > 1))
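
# (Supplementary; not in the book.) As a quick check alongside the sunburst,
# we might inspect the most frequent sequences directly as a sorted table.
# This sketch uses base R only; the name "epa.seq.sorted" and the choice of
# 10 rows are illustrative, not from the book.
epa.seq.sorted <- epa.sequences.freq[order(epa.sequences.freq$Freq,
                                           decreasing=TRUE), ]
head(epa.seq.sorted, 10)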