Multiple Testing Vignette

Learning objectives
- build intuition about p-values when multiple testing is performed via simulations
- recognize the need for multiple testing correction
- present methods to correct for multiple testing:
  - Bonferroni correction
  - FDR (false discovery rate)
Why do we need multiple testing correction?
What do p-values look like under the null and alternative?
Simulate vectors X, Yalt \(= X\cdot \beta + \epsilon\), and Ynull (independent of X)
We start defining some parameters for the simulations. The need for these will become obvious later.

## set seed to make simulations reproducible
set.seed(20210108)
## let's start with some parameter definitions
nsamp = 100
beta = 2
h2 = 0.1
sig2X = h2
sig2epsi = (1 - sig2X) * beta^2
sigX = sqrt(sig2X)
sigepsi = sqrt(sig2epsi)
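As a quick sanity check of these definitions (a minimal sketch, not part of the simulation itself), the proportion of the variance of Yalt explained by X should work out to h2:

## proportion of var(Yalt) explained by X: beta^2*sig2X / (beta^2*sig2X + sig2epsi)
beta^2 * sig2X / (beta^2 * sig2X + sig2epsi) ## equals h2 = 0.1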
Next, we simulate vectors X, \(\epsilon\), and Ynull, all normally distributed
X = rnorm(nsamp, mean=0, sd=sigX)
epsi = rnorm(nsamp, mean=0, sd=sigepsi)

## generate Ynull (X has no effect on Ynull)
Ynull = rnorm(nsamp, mean=0, sd=beta)
Calculate Yalt = X * beta + epsi
Yalt = X * beta + epsi
Visualize Yalt vs X
plot(X, Yalt, main="Yalt vs X"); grid()
Visualize Ynull vs X
plot(X, Ynull, main="Ynull vs X");grid()
Test association between Ynull and X
summary(lm(Ynull ~ X))
Call:
lm(formula = Ynull ~ X)
Residuals:
Min 1Q Median 3Q Max
-5.3057 -1.3026 0.0433 1.6954 7.7384
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.4484 0.2082 -2.153 0.0337 *
X 0.9107 0.6959 1.309 0.1937
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.072 on 98 degrees of freedom
Multiple R-squared: 0.01717, Adjusted R-squared: 0.007145
F-statistic: 1.712 on 1 and 98 DF, p-value: 0.1937
what’s the p-value of the association?
is the p-value significant at the 5% significance level?
Next, test the association between Yalt and X
summary(lm(Yalt ~ X))
Call:
lm(formula = Yalt ~ X)
Residuals:
Min 1Q Median 3Q Max
-5.1010 -1.1054 0.3046 1.2652 4.4881
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.0170 0.1853 0.092 0.92708
X 1.6342 0.6191 2.640 0.00966 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.843 on 98 degrees of freedom
Multiple R-squared: 0.06637, Adjusted R-squared: 0.05685
F-statistic: 6.967 on 1 and 98 DF, p-value: 0.00966
what’s the p-value of the association?
is the p-value significant at 5% significance level?
Calculate the empirical distribution of p-values
To calculate the empirical distribution of p-values under the null and the alternative, we will simulate X, Yalt, and Ynull 10,000 times.
Define a convenience function, fastlm, that runs linear regression much faster than lm. We want to run this same regression 10,000 times, so fastlm returns just the regression coefficient and the p-value for each fit.
fastlm = function(xx, yy)
{
  ## compute betahat (regression coef) and pvalue with F-test
  ## for now it does not take covariates
  df1 = 2
  df0 = 1
  ind = !is.na(xx) & !is.na(yy)
  xx = xx[ind]
  yy = yy[ind]
  n = sum(ind)
  xbar = mean(xx)
  ybar = mean(yy)
  xx = xx - xbar
  yy = yy - ybar

  SXX = sum( xx^2 )
  SYY = sum( yy^2 )
  SXY = sum( xx * yy )

  betahat = SXY / SXX

  RSS1 = sum( ( yy - xx * betahat )^2 )
  RSS0 = SYY

  fstat = ( ( RSS0 - RSS1 ) / ( df1 - df0 ) ) / ( RSS1 / ( n - df1 ) )
  pval = 1 - pf(fstat, df1 = ( df1 - df0 ), df2 = ( n - df1 ))
  res = list(betahat = betahat, pval = pval)
  return(res)
}
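As a quick check of fastlm, here is a minimal sketch comparing its slope estimate and p-value against summary(lm()) on the single simulated pair from above; the two should agree up to numerical precision:

fit_fast = fastlm(X, Ynull)
fit_lm = summary(lm(Ynull ~ X))$coefficients
c(fastlm = fit_fast$betahat, lm = fit_lm["X", "Estimate"])
c(fastlm = fit_fast$pval, lm = fit_lm["X", "Pr(>|t|)"])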
Simulate vectors X, Ynull, Yalt 10,000 times
nsim = 10000

## simulate normally distributed X and epsi
Xmat = matrix(rnorm(nsim * nsamp, mean=0, sd=sigX), nsamp, nsim)
epsimat = matrix(rnorm(nsim * nsamp, mean=0, sd=sigepsi), nsamp, nsim)

## generate Yalt (X has an effect on Yalt)
Ymat_alt = Xmat * beta + epsimat

## generate Ynull (X has no effect on Ynull)
Ymat_null = matrix(rnorm(nsim * nsamp, mean=0, sd=beta), nsamp, nsim)
## let's look at the dimensions of the simulated matrices
dim(Ymat_null)
[1] 100 10000
dim(Ymat_alt)
[1] 100 10000
Now we have 10000 independent simulations of X, Ynull, and Yalt
## give them names so that we can refer to them later more easily
colnames(Ymat_null) = paste0("c",1:ncol(Ymat_null))
colnames(Ymat_alt) = colnames(Ymat_null)
To calculate p-values under the null, run 10,000 linear regressions of Ynull on X
pvec_null = rep(NA, nsim)
bvec_null = rep(NA, nsim)

for(ss in 1:nsim)
{
  fit = fastlm(Xmat[,ss], Ymat_null[,ss])
  pvec_null[ss] = fit$pval
  bvec_null[ss] = fit$betahat
}
summary(pvec_null)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00005 0.25479 0.50579 0.50281 0.75277 0.99999
hist(pvec_null,xlab="p-value",main="Histogram of p-values under Null")
- how many simulations under the null yield p-value below 0.05? What percentage is that?
sum(pvec_null<0.05)
[1] 480
mean(pvec_null<0.05)
[1] 0.048
how many simulations under the null yield p-value < 0.20?
what do you think the proportion of simulations with p-values < \(\alpha\) (\(\alpha\) between 0 and 1) will be roughly?
Why do we need to use a more stringent significance level when we test many times?
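One way to check the questions above empirically, as a minimal sketch: under the null, p-values are approximately uniform on [0, 1], so the proportion below any threshold \(\alpha\) should be close to \(\alpha\) itself.

## proportion of null p-values below several thresholds; each should be close to the threshold
sapply(c(0.05, 0.10, 0.20), function(a) mean(pvec_null < a))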
Bonferroni correction
Use as the new significance threshold the original one divided by the number of tests, so typically
\[\frac{0.05}{\text{total number of tests}}\]
what’s the Bonferroni threshold for significance in this simulation?
how many significant associations did we find at that threshold?
BF_thres = 0.05/nsim ## Bonferroni significance threshold
print(BF_thres)
[1] 5e-06
## number of Bonferroni significant associations
sum(pvec_null<BF_thres)
[1] 0
## proportion of Bonferroni significant associations
mean(pvec_null<BF_thres)
[1] 0
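An equivalent way to apply the correction, sketched here with base R's p.adjust: it multiplies each p-value by the number of tests (capping at 1), so comparing the adjusted values to 0.05 gives the same decisions as comparing the raw p-values to BF_thres.

## number of Bonferroni-significant associations via adjusted p-values (should also be 0)
sum(p.adjust(pvec_null, method = "bonferroni") < 0.05)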
Mix of Ynull and Yalt
Let’s see what happens when we add a bunch of true associations to the matrix of null associations.
prop_alt = 0.20 ## define proportion of alternative Ys in the mixture
selectvec = rbinom(nsim, 1, prop_alt)
names(selectvec) = colnames(Ymat_alt)
selectvec[1:10]
c1 c2 c3 c4 c5 c6 c7 c8 c9 c10
0 1 0 0 0 0 0 0 0 0
Ymat_mix = sweep(Ymat_alt, 2, selectvec, FUN='*') + sweep(Ymat_null, 2, 1-selectvec, FUN='*')
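A quick sketch to confirm the sweep construction: columns of Ymat_mix where selectvec equals 1 should come from Ymat_alt, and the remaining columns from Ymat_null.

all(Ymat_mix[, selectvec == 1] == Ymat_alt[, selectvec == 1]) ## TRUE
all(Ymat_mix[, selectvec == 0] == Ymat_null[, selectvec == 0]) ## TRUE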
Run linear regression for all 10,000 phenotypes in the mix of true and false associations, Ymat_mix
pvec_mix = rep(NA, nsim)
bvec_mix = rep(NA, nsim)
for(ss in 1:nsim)
{
  fit = fastlm(Xmat[,ss], Ymat_mix[,ss])
  pvec_mix[ss] = fit$pval
  bvec_mix[ss] = fit$betahat
}
summary(pvec_mix)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.08682 0.38936 0.40822 0.69073 0.99999
hist(pvec_mix,xlab="p-value",main="Histogram of p-values under mixture of null and alt")
m_signif = sum(pvec_mix < 0.05) ## observed number of significant associations
m_expected = 0.05 * nsim ## expected number of significant associations under the worst case scenario, where all features belong to the null
m_signif
[1] 2146
m_expected
[1] 500
Under the null, we were expecting 500 significant columns by chance but got 2146
Q: how can we estimate the proportion of true positives?
We got 1646 extra columns, so it’s reasonable to expect that the extra significant results come from the alternative distribution (Yalt). So \[\frac{\text{observed number of significant} - \text{expected number of significant}}{\text{observed number of significant}}\] should be a good estimate of the true discovery rate. False discovery rate is defined as 1 - the true discovery rate.
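Plugging the observed and expected counts into this formula gives the estimate; a minimal sketch using the quantities already computed:

TDR_est = (m_signif - m_expected) / m_signif ## (2146 - 500) / 2146, about 0.767
FDR_est = 1 - TDR_est ## estimated false discovery rate, about 0.233
c(TDR_est, FDR_est)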
thres = 0.05
FDR = sum((pvec_mix < thres & selectvec == 0)) / sum(pvec_mix < thres)
## proportion of null columns that are significant among all significant columns
FDR
[1] 0.1766076
If we use a p-value threshold of 0.05, 82.34 percent of the significant columns are true discoveries. In this case, we know which associations are true or false because we decided, using the selectvec vector, which simulated Y would be a function of X and which would be unrelated to X.
what’s the proportion of false discoveries if we use a significance level of 0.01?
what’s the proportion of false discoveries if we use Bonferroni correction as the significance level?
What’s the proportion of missed signals, proportion of true associations that have p-values greater than the Bonferroni threshold?
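A sketch of how these three quantities can be computed from the simulation, using selectvec as the ground truth:

## false discovery proportion at significance level 0.01
sum(pvec_mix < 0.01 & selectvec == 0) / sum(pvec_mix < 0.01)
## false discovery proportion at the Bonferroni threshold
sum(pvec_mix < BF_thres & selectvec == 0) / sum(pvec_mix < BF_thres)
## proportion of true associations missed by the Bonferroni threshold
mean(pvec_mix[selectvec == 1] > BF_thres)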
Common approaches to control type I errors
Assuming we are testing \(m\) hypotheses, let’s define the following terms for the different errors.
|           | Called significant | Called not significant | Total   |
|-----------|--------------------|------------------------|---------|
| Null true | \(F\)              | \(m_0 - F\)            | \(m_0\) |
| Alt true  | \(T\)              | \(m_1 - T\)            | \(m_1\) |
| Total     | \(S\)              | \(m - S\)              | \(m\)   |
- Bonferroni correction assures that the FWER (familywise error rate) \(P(F \ge 1)\) stays below the acceptable type I error, typically 0.05: \[P(F \ge 1) < \alpha. \] We achieve that by requiring that each test satisfy \[p<\alpha/\text{# tests}.\] This can be too stringent and lead to missing real signals (see the union-bound sketch after this list).
- pFDR (positive false discovery rate): \[E\left(\frac{F}{S} \,\middle|\, S>0\right)\]
- the qvalue of a feature (e.g., a SNP) is the minimum false discovery rate attainable when that feature is called significant
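Why the Bonferroni threshold controls the FWER, in one line: since each null p-value is uniform, the union bound gives \[P(F \ge 1) = P\left(\bigcup_{i \in \text{null}} \left\{p_i \le \frac{\alpha}{m}\right\}\right) \le \sum_{i \in \text{null}} P\left(p_i \le \frac{\alpha}{m}\right) = m_0 \, \frac{\alpha}{m} \le \alpha.\]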
Table of null or alternative vs. significant or not significant
count_table = t(table(pvec_mix > 0.05, selectvec))
colnames(count_table) = c("Called significant", "Called not significant")
rownames(count_table) = c("Null true", "Alt true")
knitr::kable(count_table)
|           | Called significant | Called not significant |
|-----------|--------------------|------------------------|
| Null true | 379                | 7646                   |
| Alt true  | 1767               | 208                    |
Let’s calculate the qvalue
Use the qvalue package to calculate the FDR, and check whether small qvalues correspond to true associations (i.e., the phenotype was generated under the alternative distribution).
## install qvalue if not available.
if(F) ## I set it to F now because I already installed the qvalue package
{
  if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  BiocManager::install("qvalue")
}
## calculate qvalues using the qvalue function, which returns a list;
## we select the qvalue vector, which gives the false discovery rate incurred
## if the significance threshold were set at that simulation's p-value
qres_mix = qvalue::qvalue(pvec_mix)
qvec_mix = qres_mix$qvalue

qres_null = qvalue::qvalue(pvec_null)
qvec_null = qres_null$qvalue
boxplot(qvec_mix~selectvec)
##plot(qvec_mix,col=selectvec*2 + 1, pch=selectvec + 1, lwd=selectvec*2 + 1)
## using selectvec*2 + 1 as a quick way to get the vector to be 1 and 3 (1 is black, 3 is green) instead of 1 and 2 (2 is red and can be difficult to distinguish for color-blind people)
Plot sorted qvalues and color by the selectvec status (true association status)
ind = order(qvec_mix, decreasing=FALSE)
plot(sort(qvec_mix), col=selectvec[ind]*2 + 1, pch=selectvec[ind] + 1, lwd=selectvec[ind]*2 + 1)
summary(qvec_mix)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000006 0.2808014 0.6296951 0.5054476 0.7448208 0.8087707
- do small qvalues tend to be true?
- interpret the figure
## distribution of qvalues and pvalues by causal status
boxplot(pvec_mix ~ selectvec, main='pvalue by causal status; 1=alt, 0=null')
boxplot(qvec_mix ~ selectvec, main='qvalue by causal status; 1=alt, 0=null')
- interpret these figures
How do qvalues and pvalues relate to each other?
plot(pvec_null,qvec_null,main='qvalue vs pvalue for null')
plot(pvec_mix,qvec_mix,main='qvalue vs pvalue for mixture of null and alt')
q-values are monotonic functions of p-values
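A sketch to verify this numerically: ordering the simulations by p-value should leave the q-values non-decreasing.

all(diff(qvec_mix[order(pvec_mix)]) >= 0) ## TRUE if qvalues are monotone in pvalues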
- what’s the smallest qvalue when all simulations are from the null?
- what’s the smallest qvalue when all simulations are from the mixture?
pi0 and pi1
pi0 is the proportion of null hypotheses, which can be estimated using the qvalue package. pi1 = 1 - pi0 is the proportion of true positive associations. This is a useful parameter, as we will see in later classes.
qres_null$pi0
[1] 1
qres_mix$pi0
[1] 0.8087771
- what proportion of true positives did you expect given the simulations we performed?
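As a sketch, compare the pi0 estimated from the mixture to the true proportion of nulls used in the simulation:

c(estimated_pi0 = qres_mix$pi0, true_pi0 = 1 - prop_alt)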
References
Storey, John D., and Robert Tibshirani. 2003. “Statistical Significance for Genomewide Studies.” Proceedings of the National Academy of Sciences 100 (16): 9440–45.