################################################################### ### Hemant Ishwaran ################################################################### ### Plot ROC curve for ordinal categorical data (works for ### arbitrary number of categores, thus applies to "continuous" ### ordinal categorical data) ### ### Function does: ### ------------- ### -- Plot empirical ROC curve and compute area under the ### curve using trapezoidal rule ### -- Plot CDF for diseased and non-diseased groups using the ### same axis ### -- Plot back to back probability histograms for ### diseased and non-diseased groups ### ### ### Example 1 Degree of suspicion of stage C/D prostate ### --------- cancer on ordinal scale {1,2,...100} ### ### -- true disease status (gold standard information) ### -- value of 1 corresponds to a diseased patient: ### ### group <- c( ### 1,0,1,0,1,1,0,1,1,1,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,0,1,0,1,0,0,1,0,1,0, ### 0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,1,1,0,1,1, ### 1,1,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,1,1,1,1,1,1,1) ### ### -- ordinal value for degree of suspicion: ### ### ordinal <- c( ### 99,5,40,20,10,30,5,50,5,25,50,5,60,10,5,5,15,15,5,5,40,60,5,5,20,5,60, ### 5,15,5,5,30,25,5,5,5,5,60,40,5,5,10,10,5,0,40,5,50,5,5,5,70,5,50,0,5, ### 30,10,20,60,5,10,5,10,5,70,10,15,40,10,80,40,5,60,5,60,5,10,10,50,15, ### 5,60,20,70,10,5,30,10,50,10,60,5,40,50,80,5,15,25) ### ### -- plot roc curves (use mtxt for custom titles): ### ### rocPlot(group,ordinal,mtxt="ROC plot for prostate data") ### ### ### Example 2 Randomly generated continuous data ### --------- ### ### rocPlot(rep(c(0,1),c(50,50)),rnorm(100,rep(c(90,100),c(50,50)),10)) ### ### ------------------------------------------------------------- ### Hemant Ishwaran ishwaran@bio.ri.ccf.org ### Dept of Biostatistics/Wb4 phone: 216-444-9932 ### Cleveland Clinic Foundation fax: 216-445-2781 ### 9500 Euclid Avenue ### Cleveland, OH 44195 ### ### http://www.bio.ri.ccf.org/Resume/Pages/Ishwaran ################################################################### rocPlot <- function(group,ordinal, nbreaks=15,minAtom.plot=25, mtxt="ROC Plots",...) { group.uniq <- sort(unique(group)) neg <- ordinal[group==group.uniq[1]] pos <- ordinal[group==group.uniq[2]] n.neg <- length(neg) n.pos <- length(pos) range.data <- range(neg,pos) xtxt <- 'Test Results' ### Find all distinct atoms generated by pos and neg ### test values. atoms <- unique(sort(c(neg,pos))) atoms.neg <- unique(sort(neg)) atoms.pos <- unique(sort(pos)) n.atoms <- length(atoms) tp <- rep(0,n.atoms) fp <- rep(0,n.atoms) cdf.neg <- rep(0,length(atoms.neg)) cdf.pos <- rep(0,length(atoms.pos)) ### (1) Compute tp and fp values for each distinct atom ### (2) Compute CDF's for (i in 1:n.atoms){ tp[i] <- sum(pos >= atoms[i])/n.pos fp[i] <- sum(neg >= atoms[i])/n.neg } for (i in 1:length(atoms.neg)){ cdf.neg[i] <- sum(neg <= atoms.neg[i])/n.neg } for (i in 1:length(atoms.pos)){ cdf.pos[i] <- sum(pos <= atoms.pos[i])/n.pos } ### Nice touch is to add end values for tp,fp values and cdf's tp <- c(1,tp,0) fp <- c(1,fp,0) delta <- (range.data[2]-range.data[1])/20 atoms.neg <- c(atoms[1]-delta,atoms.neg,atoms[n.atoms]+delta) atoms.pos <- c(atoms[1]-delta,atoms.pos,atoms[n.atoms]+delta) cdf.neg <- c(0,cdf.neg,1) cdf.pos <- c(0,cdf.pos,1) ### (1) Calculate area under curve using Wilcoxon U-statistic ### (2) Estimate its standard error tp.mean <- (tp[1:(n.atoms+1)] + tp[2:(n.atoms+2)])/2 fp.diff <- -diff(fp) area.U <- sum(fp.diff*tp.mean) area.S <- sqrt((area.U*(1 - area.U)+(n.pos - 1)*(area.U/(2 - area.U) - area.U^2) + (n.neg - 1)*((2*area.U^2)/(1 + area.U) - area.U^2))/(n.pos*n.neg)) ### (1) Plot Empirical ROC. Use points and segments if there are ### fewer than minAtom.plot distinct values ### (2) Plot probability histograms for positive and negative ### test values on same axis for comparison. Use points and ### segments if there are fewer than minAtom.plot values ### (3) Plot CDF for test values par(pty="s",mfrow=c(1,3)) if(n.atoms>=minAtom.plot){ plot(fp,tp,type="l",xlim=c(0.0,1.0), ylim=c(0.0,1.0),xlab="False Positive Ratio", ylab="Sensitivity",main="Empirical ROC") text(0.7,0.1,paste('Area=',format(round(area.U,3)), '+/-',format(round(area.S,3)))) } else{ plot(fp,tp,type="b",xlim=c(0.0,1.0), ylim=c(0.0,1.0),xlab="False Positive Ratio", ylab="Sensitivity",main="Empirical ROC") text(0.7,0.1,paste('Area=',format(round(area.U,3)), '+/-',format(round(area.S,3)))) } if(n.atoms>=minAtom.plot){ plot(atoms.neg,cdf.neg,type="s",lty=1, xlim=range(atoms.neg,atoms.pos), ylim=range(cdf.neg,cdf.pos), xlab=xtxt,ylab="Cumulative Probabilities", main="CDF Plot") par(new=T) plot(atoms.pos,cdf.pos,type="s",lty=2, xlab="",ylab="") } else{ plot(atoms.neg,cdf.neg,type="s",lty=1, xlim=range(atoms.neg,atoms.pos), ylim=range(cdf.neg,cdf.pos), xlab=xtxt,ylab="Cumulative Probabilities", main="CDF Plot") rug(atoms) par(new=T) points(atoms.neg,cdf.neg,pch="x") par(new=T) plot(atoms.pos,cdf.pos,type="s",lty=2, xlab="",ylab="") par(new=T) points(atoms.pos,cdf.pos,type="p",pch="o") } # Mimic a probability histogram, using barplot # Back to back plots. nbreaks <- min(nbreaks,max(n.atoms,range.data[2]-range.data[1])) data.bks <- seq(range.data[1]-.001,range.data[2]+.001, length=nbreaks+2) temp.neg <- hist(neg,breaks=data.bks,plot=F) pct.neg <- temp.neg$count temp.pos <- hist(pos,breaks=data.bks,plot=F) pct.pos <- temp.pos$count barplot(rbind(pct.pos,-pct.neg), yaxt="n",srt=90, space=0,angle=c(5,45),density=c(15,30), xlab=xtxt, ylab="Counts",main="Histogram") junk.fit <- lm(seq(0.5, length(data.bks), by=1) ~ data.bks) pretty.x <- coef(junk.fit) %*% t(cbind(1,pretty(data.bks,10))) nice.probs <- pretty(range(c(pct.pos,-pct.neg))) axis(1,at=pretty.x, labels=pretty(data.bks,10),ticks=0) axis(2,at=nice.probs,lab=abs(nice.probs)) # Overall title mtext(mtxt,side=3,line=0,cex=2,outer=TRUE) }