ICLUST.cluster <- function (r.mat,ICLUST.options) {#should allow for raw data, correlation or covariances #options: alpha =1 (minimum alpha) 2 (average alpha) 3 (maximum alpha) # beta =1 (minimum beta) 2 (average beta) 3 (maximum beta) # correct for reliability # reverse score items if negative correlations # stop clustering if beta for new clusters < beta.min # output =1 (short) 2 (show steps) 3 show rejects as we go # #initialize various arrays and get ready for the first pass output <- ICLUST.options$output num.var <- nrow(r.mat) keep.clustering <- TRUE #used to determine when we are finished clustering results <- data.frame(matrix(rep(0,18*(num.var-1)),ncol=18)) names(results) <- c("Item/Cluster", "Item/Clust","similarity","correlation","alpha1","alpha2", "beta1","beta2","size1","size2","rbar1","rbar2","r1","r2","alpha","beta","rbar","size") rownames(results) <- paste("C",1:(num.var-1),sep="") digits <- ICLUST.options$digits clusters <- diag(1,nrow =nrow(r.mat)) #original cluster structure is 1 item clusters if(is.null(rownames(r.mat))) {rownames(r.mat) <- paste("V",1:num.var,sep="") } rownames(clusters) <- rownames(r.mat) colnames(clusters) <- paste("V",1:num.var,sep="") diag(r.mat) <- 0 row.range <- apply(r.mat,1,range,na.rm=TRUE) item.max<- pmax(abs(row.range[1,]),abs(row.range[2,])) #find the largest absolute similarity diag(r.mat) <- 1 count=1 #master loop while (keep.clustering) { #loop until we figure out we should stop #find similiarities #we will do most of the work on a copy of the r.mat cluster.stats <- cluster.cor(clusters,r.mat,FALSE,digits=9,SMC=ICLUST.options$SMC) sim.mat <- cluster.stats$cor #the correlation matrix diag(sim.mat) <- 0 #we don't want 1's on the diagonal to mess up the maximum #two ways to estimate reliability -- for 1 item clusters, max correlation, for >1, alpha #this use of initial max should be an option if (ICLUST.options$correct) { #find the largest and smallest similarities for each variable row.range <- apply(sim.mat,1,range,na.rm=TRUE) row.max <- pmax(abs(row.range[1,]),abs(row.range[2,])) #find the largest absolute similarity } else {row.max <- rep(1, nrow(sim.mat)) } #don't correct for largest similarity item.rel <- cluster.stats$alpha for (i in 1: length(item.rel)) { if (cluster.stats$size[i]<2) { item.rel[i] <- row.max[i] #figure out item betas here? }} if(output>3) print(sim.mat) sq.max <- diag(1/sqrt(item.rel)) #used to correct for reliabilities #this is the corrected for maximum r similarities if (ICLUST.options$correct) {sim <- sq.max %*% sim.mat %*% sq.max #this corrects for reliabilities but messes up the correlations of two item clusters with items } else {sim <- sim.mat} diag(sim) <- NA #we need to not consider the diagonal when looking for maxima #find the most similar pair and apply tests if we should combine test.alpha <- FALSE test.beta <- FALSE while(!(test.alpha&test.beta)){ max.cell <- which.max(sim) #global maximum if (length(max.cell) < 1) { keep.clustering <- FALSE break} #there are no non-NA values left sign.max <- 1 if ( ICLUST.options$reverse ) { #normal case is to reflect if necessary min.cell <- which.min(sim) #location of global minimum if (sim[max.cell] < abs(sim[min.cell] )) { sign.max <- -1 max.cell <- min.cell } if (sim[max.cell] < 0.0) {sign.max <- -1 }} #this is a weird case where all the similarities are negative -- happens towards the end of clustering max.col <- trunc(max.cell/nrow(sim))+1 #is in which row and column? max.row <- max.cell - (max.col-1)*nrow(sim) #need to fix the case of first column if (max.row < 1) {max.row <- nrow(sim) max.col <- max.col-1 } size1 <- cluster.stats$size[max.row] if(size1 < 2) {V1 <- 1 beta1 <- item.rel[max.row] alpha1 <- item.rel[max.row] rbar1 <- item.rel[max.row] } else { rbar1 <- results[cluster.names[max.row],"rbar"] beta1 <- results[cluster.names[max.row],"beta"] alpha1 <- results[cluster.names[max.row],"alpha"] V1 <- size1 + size1*(size1-1) * rbar1 } size2 <- cluster.stats$size[max.col] if(size2 < 2) {V2 <- 1 beta2 <- item.rel[max.col] alpha2 <- item.rel[max.col] rbar2 <- item.rel[max.col] } else { rbar2 <- results[cluster.names[max.col],"rbar"] beta2 <- results[cluster.names[max.col],"beta"] alpha2 <- results[cluster.names[max.col],"alpha"] V2 <- size2 + size2 * (size2-1) * rbar2} Cov12 <- sign.max * sim.mat[max.cell] * sqrt(V1*V2) #this flips the sign of the correlation for negative correlations r12 <- Cov12/(size1*size2) #average between cluster r V12 <- V1 + V2 + 2 * Cov12 #the variance of the new cluster size12 <- size1 + size2 V12c <- (V12 - size12)*(size12/(size12-1)) #true variance (using the average r on the diagonal) rbar <- V12c/(size12^2) alpha <- V12c/V12 #combine these two rows if the various criterion are passed #beta.weighted <- size12^2 * sign.max *r12/V12 #this was added June, 2009 but can produce negative betas beta.weighted <- size12^2 *r12/V12 #corrected July 28, 2009 beta.unweighted <- 2* sign.max*sim.mat[max.cell]/(1+sign.max* sim.mat[max.cell]) if(ICLUST.options$weighted) {beta.combined <- beta.weighted} else {beta.combined <- beta.unweighted} #what is the correlation of this new cluster with the two subclusters? #this considers item overlap problems #There are actually two alternative solutions #a) (cor.gen=TRUE) finds the correlation due to a shared general factor #b) (cor.gen=FALSE) finds the correlation for the general + group but remove the item overlap problem #neither seems optimal, in that a) will correctly identify non-correlated clusters, but b) is less affected by small clusters. if(ICLUST.options$cor.gen) { c1 <- r12 * size1 * size1 + Cov12 #corrected covariance c2 <- sign.max*(r12 * size2 * size2 + Cov12) } else { c1 <- size1^2* rbar1 + Cov12 c2 <- sign.max*(size2^2 *rbar2 + Cov12) } if((size1 < 2) && (size2 < 2)) { #r2 should be flipped if necessary -- r2 is always flipped (if necessary) when forming clusters r1 <- sqrt(abs(rbar1)) #this corrects for reliability in a two item cluster r2 <- sign.max* r1 #flips the sign if two are negatively correlated -- in the case of two items } else { #this next part corrects for item overlap as well as reliability of the subcluster if (ICLUST.options$correct.cluster) { #correct is the default option if(TRUE) {r1 <- c1/sqrt((V1 - size1 +size1 * rbar1) * V12) if (size2 < 2) { r2 <- c2/sqrt(abs(rbar2)*V12)} else { # r2 <- sign.max * c2/sqrt((V2-size2 + size2 * rbar2)*V12c)} #changed yet again on 6/10/10 r2 <- c2/sqrt((V2-size2 + size2 * rbar2)*V12c)} } else { if(size1 < 2 ) { r1 <- c1/sqrt(abs(rbar1)*V12)} else { r1 <- c1/sqrt((V1-size1 + size1 * rbar1)*V12c) } #flip the smaller of the two clusters -- no, flip r2 if (size2 < 2) {r2 <- c2/sqrt(abs(rbar2)*V12)} else { r2 <- c2/sqrt((V2-size2 + size2*rbar2)*V12c)} # r2 <- c2/sqrt((V2-size2+size2*rbar2)*V12c) } } else {if(TRUE) {r1 <- c1/sqrt(V1*V12) #do not correct r2 <- sign.max* c2/sqrt(V2*V12) } else { r1 <-sign.max* c1/sqrt(V1*V12) } #flip the smaller of the two clusters - flip r2 r2 <- c2/sqrt(V2*V12) } } #test if we should combine these two clusters #first, does alpha increase? test.alpha <- TRUE if (ICLUST.options$alpha>0) { #should we apply the alpha test? if (ICLUST.options$alpha.size < min(size1,size2)) { switch(ICLUST.options$alpha, {if (alpha < min(alpha1,alpha2)) {if (output>2) {print( paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col], 'new alpha =', round (alpha,digits),'old alpha1 =',round( alpha1,digits),"old alpha2 =",round(alpha2,digits)))} test.alpha <- FALSE }}, {if (alpha < mean(alpha1,alpha2)) {if (output>2) {print(paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],'new alpha =', round (alpha,digits), 'old alpha1 =',round( alpha1,digits),"old alpha2 =",round(alpha2,digits)))} test.alpha <- FALSE }}, {if (alpha < max(alpha1,alpha2)) {if (output>2) {print(paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],'new alpha =', round (alpha,digits), 'old alpha1 =',round( alpha1,digits),"old alpha2 =",round(alpha2,digits)))} test.alpha <- FALSE }}) #end switch } #end if options$alpha.size } #second, does beta increase ? test.beta <- TRUE if (ICLUST.options$beta>0) { #should we apply the beta test? if (ICLUST.options$beta.size < min(size1,size2)) { switch(ICLUST.options$beta, {if (beta.combined < min(beta1,beta2)) {if (output>2) {print( paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],'new beta =', round (beta.combined,digits),'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))} test.beta <- FALSE }}, {if (beta.combined < mean(beta1,beta2)) {if (output>2) {print(paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],'new beta =', round (beta.combined,digits), 'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))} test.beta <- FALSE }}, {if (beta.combined < max(beta1,beta2)) {if (output>2) {print(paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],'new beta =', round (beta.combined,digits), 'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))} test.beta <- FALSE }}) #end switch } #end if options$beta.size } if(test.beta & test.alpha) { break} else { #we have failed the combining criteria if((ICLUST.options$n.clus > 0) & ((num.var - count ) >= ICLUST.options$n.clus) ) {warning ("Clusters formed as requested do not meet the alpha and beta criteria. Perhaps you should rethink the number of cluster settings.") break } else { if (beta.combined < ICLUST.options$beta.min) { keep.clustering <- FALSE #the most similiar pair is not very similar, we should quit break} else { sim[max.row,max.col] <- NA sim[max.col,max.row] <- NA }} } #end of test.beta & test.alpha } #end of while test.alpha & test.beta.loop #combine and summarize if (keep.clustering) { # we have passed the alpha and beta tests, now combine these two variables clusters[,max.row] <- clusters[,max.row] + sign.max * clusters[,max.col] cluster.names <- colnames(clusters) #summarize the results results[count,1] <- cluster.names[max.row] results[count,2] <- cluster.names[max.col] results[count,"similarity"] <- sim[max.cell] results[count,"correlation"] <- sim.mat[max.cell] results[count,"alpha1"] <- item.rel[max.row] results[count,"alpha2"] <- item.rel[max.col] size1 <- cluster.stats$size[max.row] size2 <- cluster.stats$size[max.col] results[count,"size1"] <- size1 results[count,"size2"] <- size2 results[count,"beta1"] <- beta1 results[count,"beta2"] <- beta2 results[count,"rbar1"] <- rbar1 results[count,"rbar2"] <- rbar2 results[count,"r1"] <- r1 results[count,"r2"] <- r2 results[count,"beta"] <- beta.combined results[count,'alpha'] <- alpha results[count,'rbar'] <- rbar results[count,"size"] <- size12 #update cluster.names[max.row] <- paste("C",count,sep="") colnames(clusters) <- cluster.names clusters <- clusters[,-max.col] cluster.names <- colnames(clusters) #row.max <- row.max[-max.col] } #end of combine section if(output > 1) print(results[count,],digits=digits) count=count+1 if ((num.var - count) < ICLUST.options$n.clus) {keep.clustering <- FALSE} if(num.var - count < 1) {keep.clustering <- FALSE} #only one cluster left } #end of keep clustering loop #make clusters in the direction of the majority of the items #direct <- -(colSums(clusters) < 0 ) #clusters <- t(diag(direct) %*% t(clusters)) #colnames(clusters) <- cluster.names ICLUST.cluster <- list(results=results,clusters=clusters,number <- num.var - count) } # end ICLUST.cluster #modified June 12, 2008 to calculate the item-cluster correlation for cluster of size 2 #modified June 14, 2009 to find weighted or unweighted beta #unweighted had been the default option before but it would seem that weighted makes more sense #modified June 5, 2010 to correct the graphic tree paths