# 9/16/05
# R Script for combining "Single Response", "Multiple Response"
# and "Text Response" files.  These files are collected by a QSurvey
# survey on a Zope server (i.e. http://web2survey.unt.edu/ )
#
# Download these files from your Zope RESULTS folder into a
# directory on your local C: drive in a folder called c:\temp
# Tutorials on using QSurvey and downloading data from collected
# survey results can be found at(review AVI video #9):
#
#  http://www.unt.edu/benchmarks/archives/2005/june05/rss.htm
#
# Download three types of data sets: single response, multiple
# response and text response.

# e.g. c:\temp\asTsvSingle
#      c:\temp\asTsvMultiple
#      c:\temp\asTsvText
#
# These files are subsequently called:
#
#    "mdata.part1" - single response "forced choice & other" data
#    "mdata.part2" - multiple response data
#    "mdata.part3" - single response text data


mdata.part1<-read.delim("c:/temp/asTsvSingle",quote="")
mdata.part2<-read.delim("c:/temp/asTsvMultiple",quote="")
mdata.part3<-read.delim("c:/temp/asTsvText",quote="")


# Break apart the string sequence which contains "id" information

id.matrix1<-matrix(unlist(strsplit(as.character(mdata.part1$id), "_")), ncol=8, byrow=T)
id.matrix2<-matrix(unlist(strsplit(as.character(mdata.part2$id), "_")), ncol=8, byrow=T)
id.matrix3<-matrix(unlist(strsplit(as.character(mdata.part3$id), "_")), ncol=8, byrow=T)

# Calculate the number of rows for each data type

id.number1<-1:nrow(id.matrix1)
id.number2<-1:nrow(id.matrix2)
id.number3<-1:nrow(id.matrix3)

# Combine the "exploded" string information with the original data
# into one data.frame

mdata.text.to.combine.dat<-data.frame(cbind(id.number3, id.matrix3, mdata.part3))

# Rename three of the columns
names(mdata.text.to.combine.dat)[c(1,9,10)]<-c("rec.num","id","full.id")

mdata.single.to.combine.dat<-data.frame(cbind(id.number1, id.matrix1, mdata.part1))

# Rename three of the columns
names(mdata.single.to.combine.dat)[c(1,9,10)]<-c("rec.num","id","full.id")

# Create two seperate data.frames for frequency tabulation for
# Single response and Multiple response data

# First, prepare Single response data for tabulation

mdata.single.dat<-data.frame(cbind(id.number1, id.matrix1, mdata.part1))
col.size<-ncol(mdata.single.dat)

mdata.single.dat<-mdata.single.dat[, c(1, 9, 10, 12:col.size)]
names(mdata.single.dat)[1:3]<-c("rec.num","id","full.id")

# Now prepare Multiple response data

mdata.multiple.dat<-data.frame(cbind(id.number2, id.matrix2, mdata.part2))
mdata.multiple.dat<-mdata.multiple.dat[, c(1, 9, 10, 11, 12)]

names(mdata.multiple.dat)<-c("rec.num", "id", "full.id", "page.question", "response")

# Calculate information that is needed for unstacking multiple
# response data - will need to unstack before we tabulate

mdata.ids<-as.vector(unique(mdata.multiple.dat$id))
mdata.id.split<-split(as.character(mdata.multiple.dat$id),
                     f=as.character(mdata.multiple.dat$id))

# Calculate a sequence for a set of records within each "id"
# We will need this to unstack the data into unique variable names

mdata.id.seq<-lapply(mdata.id.split, seq)

# Now stack the sequences (that were just created) - this will
# range from 1 to the number of records for a particular "id" value.
# Again, we will need this variable to unstack the multiple response data

temp<-0
for (i in 1:length(mdata.id.seq)){
    temp<-c(temp,mdata.id.seq[[mdata.ids[i]]])
}

# Strip off temp value from top of the sequence column (i.e. zero)

col.seq.ids<-temp[2:length(temp)]

attach(mdata.multiple.dat)

# Combine the question type info (e.g. "P1q2") with the actual response
# into one column.  This will save ALOT of space in the final
# merged data.frame! We can tabulate the type of question and its
# response together as a single "response type"

response.type<-paste(mdata.multiple.dat$page.question, mdata.multiple.dat$response)

# Create a new data.frame with renamed columns.

mdata.mult.dat<-data.frame(col.seq.ids, id, full.id, response.type)

# Finally!! Now unstack multiple response data into a "wide" format

mdata.mult<-reshape(mdata.mult.dat, timevar=c("col.seq.ids"),
           idvar=c("id","full.id"), direction="wide")

# Rename the row names

row.names(mdata.mult)<-1:nrow(mdata.mult)

# Now what we've all been waiting for....
# Print out frequency tables for multiple response data

ncol.mdata.mult<-ncol(mdata.mult)
freq.response.type<-unlist(apply(mdata.mult[,2:ncol.mdata.mult], 2, table))
freq.response.type.df<-data.frame(freq.response.type)
names(freq.response.type.df)<-c("freq.response.type")

freq.response.type.df

# Print out frequency tables for single response data

table.freq.single<-unlist(apply(mdata.single.dat[,3:ncol(mdata.single.dat)], 2, table))
table.freq.single.df<-data.frame(table.freq.single)

table.freq.single.df

#### In this next section we merge all three types of
#### data (i.e. single response, multiple response, and text)
#### into one data.frame for export into an .xls format

# Check to make sure records match up for each of the three records

head(mdata.single.to.combine.dat, n=1)
head(mdata.mult, n=1)
head(mdata.text.to.combine.dat, n=1)

# Merge "Single Response" and "Multiple Response" records together by
# variable "id"

mdata.combined.s.m.dat<-merge(mdata.single.to.combine.dat, mdata.mult,
                     by=c("id","full.id"), all=TRUE, sort=FALSE)

# Merge previous merged file (single and multiple) with "Text Response"
# records

# Duplicate columns from the merged files will have variable names appended
# with ".x" and ".y"

mdata.combined.s.m.t.dat<-merge(mdata.combined.s.m.dat, mdata.text.to.combine.dat,
                       by=c("id","full.id"), all=TRUE, sort=FALSE)


############ Clean the data.frame up a bit

# Clean leading and trailing spaces

trim<- function(s){
    s <- sub("^ +", "", s)
    s <- sub(" +$", "", s)
    s}

# This function will examine each cell and trim leading and trailing spaces

clean.data<-function(data){
   x<-data
   data<-if(is.character(data)){
      x<-trim(x)
      return(x)}
   data<-if(!is.character(data)){return(x)}
}

# Apply the clean.data function to the data

mdata<-data.frame(apply(mdata.combined.s.m.t.dat, c(1,2), clean.data))

# Finished processing data.
# Write file out to a "tab-delimited" file with variable names on line 1

write.table(mdata, "c:/temp/mdata.txt", sep="\t", col.names=TRUE,
            row.names=FALSE, quote=FALSE, na="")