#################################################################
# Code for: Quantitative User Experience Research
# Chapter 10 -- MaxDiff
#
# Authors:  Chris Chapman & Kerry Rodden
#
# Copyright (c) 2023, Chris Chapman & Kerry Rodden
#
# Last update: March 3, 2023
# Version: 1.0
#
# Licensed under the MIT License (the "License");
# you may not use this file except in compliance with the License.
#
# You may obtain a complete copy of the License in the accompanying file
# "MIT-license.txt" or at: https://opensource.org/licenses/MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

#################################################################
# BRIEF HOW TO USE
# This file contains scripts used in Chapter 10 of Chapman & Rodden (2023),
#   "Quantitative User Experience Research", Apress.
#
# We recommend readers to type the code from the book as that
# accelerates learning. However, this file may be used instead;
# just step through it section by section to match the book.
#
# Note that there may be code sections and comments here that
# do not appear in the book, with supplementary information.
#################################################################


# MaxDiff analysis code for example data
# Chapman & Rodden, 2023

### set up various choicetools dependencies
required.packages <- c("reshape2", "ggplot2", "mlogit", "ChoiceModelR",
                       "Rmisc", "matrixStats", "superheat", "corrplot",
                       "ggridges", "Hmisc", "car", "devtools")
needed.packages <- required.packages[!(required.packages %in% installed.packages())]
if (length(needed.packages) > 0) {
  install.packages(needed.packages)
}
# install the experimental choicetools package from GitHub
# will only install if it doesn't exist or is out of date
devtools::install_github("cnchapman/choicetools")


### EXAMPLE ONE: Pizza MaxDiff preference data

# load choicetools package to work with Qualtrics MaxDiff format data
library(choicetools)

# get the pizza data file and save it locally
# change the folder destination if needed
# NOTE: requires live internet access to the book's website
# alternative: download all data sets separately, and skip this step
if (!file.exists("qualtrics-pizza-maxdiff.csv")) {
  download.file(url = "https://bit.ly/3FvVoNE",
                destfile = "qualtrics-pizza-maxdiff.csv",
                method="auto")
}

# set up the study object
# change the data file location if needed for your system
md.define <- parse.md.qualtrics("qualtrics-pizza-maxdiff.csv",
                                returnList=TRUE)
# load the data
md.define$md.block <- read.md.qualtrics(md.define)$md.block

# check some data [omitted from book]
md.define$md.block[1:10, c(1:3, 5, 6, 10, 13:16)]

# estimate the hierarchical Bayes model
# following takes about 12 seconds on Chris's laptop
test.hb <- md.hb(md.define, mcmc.iters = 10000,
                 mcmc.seed=98101)                     # estimation (note: set mcmc.iters appropriately)
md.define$md.model.hb    <- test.hb$md.model          # save the results into our study object
md.define$md.hb.betas    <- test.hb$md.hb.betas       # raw betas for MNL
md.define$md.hb.betas.zc <- test.hb$md.hb.betas.zc    # zero-centered diff scores by individual (rec'd)

# plot the results -- sample level averages
plot.md.range(md.define) +
  theme_minimal() +
  xlab("Pizza Toppings")

# plot the results -- distribution of individual estimates
plot.md.indiv(md.define) +                       # create plot of HB model with individual mean betas
  theme_minimal() +
  ylab("Pizza Toppings")

# calculate pizza demand for 6 pizzas to be offered
names(md.define$md.hb.betas)
pizza.util <- md.define$md.hb.betas[ , c("Margherita", "Mushroom", "Arrabbiata",
                                         "Pepperoni", "Arugula", "Marinara")]
# average utility values
colMeans(pizza.util)
# exponentiated values that can be summed for aggregate "demand"
exp(colMeans(pizza.util))
# the preference share of each pizza, in percentage of the total demand in the set
# note rounding errors that make it appear to sum to 98 instead of 100 (try round(... , 2))
round(100 * exp(colMeans(pizza.util)) / sum(exp(colMeans(pizza.util))))


### EXAMPLE TWO: SIMULATED INFORMATION USAGE DATA

# General Note & Warning:
# Qualtrics formats are likely to change over time and this code may need updating,
# which is unfortunately outside the scope of the book.
# This code works with data formats used at the time of publication but
# may be expected to need adaptation over time. Be sure to test results
# for consistency as noted in the text!


# set up structure by parsing and interpreting the Qualtrics CSV
library(choicetools)

# get the use case example data
# change the folder destination if needed
# NOTE: requires live internet access to the book's website
# alternative: download all data sets separately, and skip this step
if (!file.exists("qualtrics-maxdiff-usecases.csv")) {
  download.file(url = "https://bit.ly/3SRnq9l",
                destfile = "qualtrics-maxdiff-usecases.csv",
                method="auto")
}

# check the structure
# change the file location to match your system as needed
parse.md.qualtrics("qualtrics-maxdiff-usecases.csv")

# set up the study object md.define
# change the file location to match your system as needed
md.define <- parse.md.qualtrics("qualtrics-maxdiff-usecases.csv",
                                returnList = TRUE)

# read the data (can be slow for large data sets)
md.define$md.block <- read.md.qualtrics(md.define)$md.block

# check some data
head(md.define$md.block)
# quick plot of the counts
plot.md.counts(md.define)


###
### START: this block is not in the book
###
# try the md.quicklogit() function for a quick check
# note that one reference item is omitted in a traditional logit model
# NOTE: this is omitted from the book, as it mostly duplicates the check from
# a counts plot
md.define$md.model.logit <- md.quicklogit(md.define)
# plot it
md.plot.logit(md.define)
###
### END: part that is not in the book
###


# estimate the HB model
# note: slow. estimation takes about 4 minutes on Chris's laptop
test.hb <- md.hb(md.define, mcmc.iters = 10000,
                 mcmc.seed = 98101)                   # note: set mcmc.iters appropriately
# save the estimates
md.define$md.model.hb    <- test.hb$md.model          # save the results into our study object
md.define$md.hb.betas    <- test.hb$md.hb.betas       # raw utilities by individual
md.define$md.hb.betas.zc <- test.hb$md.hb.betas.zc    # zero-centered diff scores by individual (rec'd)
rm(test.hb)
summary(md.define$md.hb.betas.zc)

# sample level average plot
plot.md.range(md.define) +
  theme_minimal() +
  xlab("Information Use Cases")

# individual level distribution
plot.md.indiv(md.define) +                       # create plot of HB model with individual mean betas
  theme_minimal() +
  ylab("Information Use Cases")

# How to add iterations to the HB model:
# We suggest to add 2000-5000 iterations at a time.
# For example, to add 20000, we could add 10 blocks of 2000.
# This helps avoid slowdowns due to memory usage.
# In this case, you can ignore warnings about total iterations being low.
for (i in 1:10) {
  test.hb <- md.hb(md.define, mcmc.iters = 2000,
                 restart = TRUE)                # add 10000 iterations; slow
}

# put the results back into the study object
md.define$md.model.hb    <- test.hb$md.model          # save the results into our study object
md.define$md.hb.betas    <- test.hb$md.hb.betas       # raw utilities by individual
md.define$md.hb.betas.zc <- test.hb$md.hb.betas.zc    # zero-centered diff scores by individual (rec'd)

# and plot the results
plot.md.range(md.define) +                # upper level ranges
  theme_minimal() +
  xlab("Information Use Cases")

plot.md.indiv(md.define) +                # distribution of individuals
  theme_minimal() +
  ylab("Information Use Cases")


######################
# EXTRA CODE
######################
# Following code is NOT covered in the book
# but may be relevant if you wish to do one of these tasks:
#
# 1. Simulate different data for your own testing purposes
# 2. Inspect the Qualtrics data format to translate some other data
# 3. Create new Qualtrics data headers to match a data set
# 4. Replace corrupt or otherwise unusable Qualtrics data headers
#
# This code is all HIGHLY EXPERIMENTAL and lightly tested
# although it was used to create the "information seeking" data set
# in the book.
#
# BEWARE that there are a few "magic numbers" here and there. Read and
# adjust those as needed.
#

# CREATE SIMULATED INFORMATION TASK DATA SET
# this is the data set used in "Example 2" above
#
# these are the items whose responses we'll simulate
online.tasks <- c(
  "Banking", "Casual gaming", "Driving directions","Email",
  "Entertaining videos", "Local events", "Multiplayer gaming",
  "Music", "Political news", "Recipes",
  "Religious information","Restaurant reviews","Schoolwork", "Shopping",
  "Social media", "Sports news", "Travel reservations",
  "Video meetings", "Weather forecast"
)

# Set up the number of "respondents", tasks, and items (cards) per task
N.obs   <- 200
N.tasks <- 15
N.cards <- 5

## create a design matrix
#
# We use the flipMaxDiff package to create a balanced design matrix
# NOTE: the standard distribution gives errors as of writing time in 2022
#    (that is flipMaxDiff from GDelin on GitHub)
# instead we are using a forked build that works from pelishk@
#
# you can install that with:
#   devtools::install_github("pelishk/flipMaxDiff")
#
# cf. https://stackoverflow.com/questions/63904736/error-installing-flipmaxdiff-package-from-github
# for more information

library(flipMaxDiff)
Sys.time()
# WARNING: the following takes about 30 minutes on Chris's laptop
md.des <- MaxDiffDesign(length(online.tasks), N.tasks, N.cards,
                        n.versions=N.obs, n.repeats=1000, seed=98101)
Sys.time()


## define preferences for 4 groups of items & respondents
# 4 general groups: information; entertainment; school; base (all)
#
# this function will return part worths for one respondent
# assigned one of the 4 groups
#
# this is of no general value; we just do this to create "segments" in
# response style who have somewhat different sets of preferences
#
one.util <- function(tasks, group) {
  # define task mapping for each group
  info.tasks <- c(1, 3, 9, 10, 11, 12, 16, 18, 19)
  ent.tasks  <- c(2, 5, 6, 8, 15, 16)
  sch.tasks  <- c(2, 4, 8, 13, 15, 18)

  # create base part worths
  base.pws <- rnorm(length(tasks))

  # strengthen those for the tasks within defined group
  if (group=="entertainment") {
    base.pws[ent.tasks] <- base.pws[ent.tasks] + 1.5
  } else if (group=="school") {
    base.pws[ent.tasks] <- base.pws[ent.tasks] + 1.5
  } else if (group=="information") {
    base.pws[ent.tasks] <- base.pws[ent.tasks] + 1.5
  }

  # randomly jitter for any given individual
  base.pws <- base.pws + rnorm(length(base.pws), mean=0, sd=1)

  # zero-center them to be formally OK as part worths
  base.pws - sum(base.pws)/length(base.pws)
}

# test the function
sum(one.util(online.tasks, "all"))
cbind(online.tasks, one.util(online.tasks, "school"))
cbind(online.tasks, one.util(online.tasks, "information"))

# now let's generate N.obs (200) simulated responses
set.seed(04644)
util.df <- data.frame(matrix(data=NA,
                             nrow=N.obs, ncol=length(online.tasks)+2))
names(util.df) <- c("ID", gsub(" ", ".", online.tasks), "group")
str(util.df)

# add N.obs respondents
for (i in 1:N.obs) {
  i.group <- sample(c("entertainment", "school", "information", "all"), 1)
  pws     <- one.util(online.tasks, i.group)
  util.df[i, 1] <- i
  util.df[i, 2:(length(online.tasks)+1)] <- pws
  util.df[i, length(online.tasks)+2] <- i.group
}
# check the results
summary(util.df)

# check zero-sum within individuals
head(md.des$versions.design, 20)
head(util.df)
item.cols <- 2:20  # <<<<< Magic numbers that only match the items defined above
summary(rowSums(util.df[ , item.cols]))
# check overall average pws
colMeans(util.df[ , item.cols])
# check zero-sum across all pws
sum(colSums(util.df[ , item.cols]))


### Simulate individual answers for the design matrix (the tasks)

# demonstrate pw calculation for 1 respondent, 1 question
des.cols  <- 3:7
(md.des1 <- md.des$versions.design[1, des.cols])
(md.pws1  <- util.df[1, item.cols])
md.pws1[md.des1]
p.choice <- exp(md.pws1[md.des1]) / sum(exp(md.pws1[md.des1]))
sample(md.des1, 1, prob = p.choice)

# does our sampling work?
draw10000 <- replicate(10000, sample(md.des1, 1, prob = p.choice))
online.tasks[md.des1]
prop.table(table(draw10000))
p.choice

### now do the draws (task answers) for all respondents

# this is a helper function to get 1 choice for 1 question
# based on its design matrix and a respondent's part worth preferences
choose.md <- function(design, pws) {
  design <- as.numeric(design)
  # MNL preference for the options in the design
  p.choice <- exp(pws[design]) / sum(exp(pws[design]))
  best  <- NA
  worst <- NA
  # make sure we don't accidentally choose the same answer for
  # best and worst
  while (is.na(best) || (!is.na(best) && best==worst)) {
    # choose answers proportionally to MNL preference
    best  <- sample(design, 1, prob = p.choice)
    worst <- sample(design, 1, prob = 1-p.choice)
  }
  c(best, worst)
}

# iterate over all respondents and tasks
# first set a place to put the results
md.data <- data.frame(md.des$versions.design)
md.data$Best  <- NA
md.data$Worst <- NA
# next, iterate over the design matrix
# this relies on an assumption of exactly 1 design matrix
# per respondent and thus a 1:1 match
# (in other cases, might look up the appropriate design per respondent)
#
for (i in 1:nrow(md.data)) {
  md.des1 <- md.data[i, des.cols]
  md.pws1 <- util.df[md.data$Version[i], item.cols]
  choices <- choose.md(md.des1, md.pws1)
  md.data$Best[i] <- choices[1]
  md.data$Worst[i] <- choices[2]
}
# check it
head(md.data)
util.df[1, ]


# get a simple count estimation of the preferences
count.best  <- prop.table(table(md.data$Best))*100*N.tasks
count.worst <- prop.table(table(md.data$Worst))*100*N.tasks
count.net   <- count.best - count.worst
count.net

md.ests <- data.frame(Items=online.tasks,
                      Defined=colMeans(util.df[ , item.cols]),
                      Counts=as.numeric(count.net))
md.ests
# how well does the "count" estimate match the true part worths?
cor(md.ests$Defined, md.ests$Counts)


### QUALTRICS DATA SETUP
#
# this section converts the data above to a Qualtrics-formatted CSV
# this code might be useful for you if:
#   1. you have MaxDiff data from Qualtrics whose headers are messed up due to editing
#   2. you want to create Qualtrics-like data for some other reason (such as a demo, like here)
# note that it is purely experimental -- use at your OWN RISK and be sure you understand it
# and test for your data set
#
# Qualtrics formats are likely to change over time and this code may need updating,
# which is unfortunately outside the scope of the book.
#


# make Qualtrics header rows for CSV data set
#
# first row:  ITEM NAMES
# second row: TEXT
# third row:  Internal names

first.row <- c("ReponseID",
               paste0(rep(paste0("Q", 1:N.tasks),
                          each=length(online.tasks)),
                      "_",
                      rep(1:length(online.tasks),
                          times=N.tasks)),
               paste0("DO-Q-Q", 1:N.tasks))

second.row <-c("ResponseID",
               rep(paste0("Which are best and worst?%-%", online.tasks),
                   times=N.tasks),
               rep("Display Order: Which are best and worst?",
                   times=N.tasks))

third.row  <- c("{'ImportId': 'responseId'}",
                paste0(rep(paste0("{'ImportId': 'QID", 1:N.tasks),
                           each=length(online.tasks)),
                       "-",
                       rep(1:length(online.tasks),
                           times=N.tasks),
                       "'}"),
                paste0("{'ImportId': 'DO-Q-Q", 1:N.tasks, "'}"))

# c(length(first.row), length(second.row), length(third.row))
headers <- rbind(first.row, second.row, third.row)
# str(headers)

## reformat choice & design matrix from the tab format
## into Qualtrics wide format
wide.md <- data.frame(matrix(NA,
                             nrow = N.obs,
                             ncol = 1 +
                               length(online.tasks) * N.tasks +
                               N.tasks))   # ID + Items*Tasks + Task DOs
names(wide.md) <- c("ResponseID",
                    paste0(
                      paste0("MaxDiff_q", rep(1:N.tasks, each=length(online.tasks))),
                      "_i",
                      rep(1:length(online.tasks), times=N.tasks)),
                    paste0("MaxDiff_DO_", 1:N.tasks))

# loop over respondents -- relies on 1 version == 1 respondent
# alternative would be to look up assigned design matrix for each respondent
design.cols <- (length(online.tasks)*N.tasks + 2):
  (length(online.tasks)*N.tasks + N.tasks + 1)
for (i in 1:N.obs) {
  # loop over tasks within respondent
  design.block <- rep("", N.tasks)
  for (j in 1:N.tasks) {
    # form very wide extended format for choices
    task.data  <- md.data[(i-1)*N.tasks+(j-1)+1, ]
    task.block <- rep("", length(online.tasks))
    best.item  <- as.numeric(task.data[N.cards+3])
    worst.item <- as.numeric(task.data[N.cards+4])
    task.block[best.item] <- "2"
    task.block[worst.item] <- "1"

    # put those into the data block
    wide.md[i, 1] <- i
    wide.md[i, ((j-1)*length(online.tasks)+2):((j)*length(online.tasks)+1)] <- task.block

    # update design matrix
    task.design <- paste0(task.data[3:(2+N.cards)], collapse="|")
    wide.md[i, design.cols[j] ] <- task.design
  }
}

### write out the data to a Qualtrics style CSV
#
# if you ran the above code to simulate data, including the random seed,
# then this should be the exact data set used in the book

# putting inside an "if false" block to avoid accidentally overwriting data
if (FALSE) {
# first write the 3 CSV header rows
  write.table(headers, file="qualtrics-maxdiff-usecases.csv", sep=",",
              row.names=FALSE, col.names = FALSE)

# then append the simulated data in wide format
  write.table(wide.md, file="qualtrics-maxdiff-usecases.csv", sep=",",
              row.names=FALSE, col.names = FALSE, append=TRUE)
}