compare ratxcan with and without loco

compare ratxcan mixed effects with and without loco

Author

Haky Im

Published

August 29, 2024

load libraries and functions

Code

#options(error=recover)
#options(error=browser)
options(error=NULL)

## compare observed correlation with null correlation
suppressMessages(devtools::source_gist("a925fea01b365a8c605e")) ## load qqR fn https://gist.github.com/hakyim/a925fea01b365a8c605e
suppressMessages(devtools::source_gist("38431b74c6c0bf90c12f")) ## qqunif https://gist.github.com/hakyim/38431b74c6c0bf90c12f
suppressMessages(devtools::source_gist("115403f16bec0a0e871f3616d552ce9b")) ## source ratxcan functions https://gist.github.com/hakyim/115403f16bec0a0e871f3616d552ce9b 

suppressMessages(library(tidyverse))
suppressMessages(library(glue))
suppressMessages(library(RSQLite))

Warning: package 'RSQLite' was built under R version 4.3.3

Code

#suppressMessages(library(expm))
#suppressMessages(library(readxl))
# install.packages("devtools")
# library("devtools")
# install_github("jdstorey/qvalue")
suppressMessages(library(qvalue))
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("biomaRt")
##suppressMessages(library(biomaRt))
##install.packages("ggrepel")
suppressMessages(library(ggrepel))


recalc=FALSE

define data and software paths for R

Code

WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"

PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"

INPUT <- glue("{WEBDATA}/ratxcan-tutorial") ## this has the input data 
OUTPUT <- glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch") ## this has the output data, intermediate results
GENO = glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/data/genotype/")


OUT_TEMPO = glue("{OUTPUT}/testing-2024-08-29")

define data and software for the terminal

Code

WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"

PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"
GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"
  
INPUT=$WEBDATA/ratxcan-tutorial
OUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratch
GENO=$WEBDATA/2023-11-27-ratxcan-tutorial/data/genotype/

OUT_TEMPO=$OUTPUT/testing-2024-08-29

read grm matrix

Code

grm_mat <- read_GRMBin(glue("{OUTPUT}/rat6k_autosome.grm"))

define myplot

Code

myplot <- function(tempres, post_titulo="",semilla="") {
  # Create a data frame with specific columns
  df <- data.frame(
    p0.01_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.01)),
    p0.01_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.01)),
    p0.05_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.05)),
    p0.05_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.05)),
    p0.10_yes = apply(tempres$pmat_correct, 2, function(x) mean(x < 0.10)), 
    p0.10_no = apply(tempres$pmat_raw, 2, function(x) mean(x < 0.10))
    # ... [rest of your code for creating df] ...
  )

# Pivot the data frame to long format, specifying the columns to keep
df_long <- pivot_longer(df, cols = starts_with("p"))

df_long <- df_long %>% separate(name,into = c("threshold","corrected"),sep="_") %>% rename(proportion=value)

# Rename the name column to replace p0.xx with p<0.xx
df_long <- df_long %>%
  mutate(threshold = gsub("p0\\.", "p<0.", threshold))

  # Create boxplots with mean
  pp <- ggplot(df_long, aes(x = threshold, y = proportion, fill = corrected)) +
    geom_boxplot(alpha = 0.6) +
    stat_summary(fun = mean, geom = "point", shape = 3, size = 2, stroke = 2, color = "blue",                  #position = position_dodge(width = 0.8)) +
                 position = position_dodge(width = -0.1)) +
    #stat_summary(fun = mean, geom = "crossbar",  size = .5, color = "blue") +
    #stat_summary(fun = mean, geom = "crossbar",  size = .5, color = "darkgray") +
    geom_hline(yintercept = c(0.01, 0.05, 0.10), linetype = "dashed", color = "gray") +
    theme_minimal(base_size = 17) +
    ggtitle(glue("Type I Error Calibration {semilla} {post_titulo}")) +
    xlab("significance") + ylab("false positive rate")

  pp
}

NEED DEBUG simulate \(Y = Sigma^{1/2}\epsilon\) and run assoc with expr_mat

Code

nsam=nrow(grm_mat)
#ind=1:nsam
ind=1:1000
test_mat = grm_mat[ind,ind]
nsam=nrow(test_mat)
Sigma = test_mat * h2 + (1 - h2) * diag(rep(1,nsam))
Sig_eigen = eigen(Sigma)
rownames(Sig_eigen$vectors) = rownames(Sigma)
##sighalf = Sig_eigen$vectors %*% diag( sqrt(  Sig_eigen$values  ) ) %*% t(Sig_eigen$vectors)
## make this multiplication more efficient using sweep
sighalf = Sig_eigen$vectors %*% sweep(t(Sig_eigen$vectors),1,sqrt(  Sig_eigen$values ),"*")

sim_sigma_pheno = sighalf %*% matrix(rnorm(nsam * nsim), nsam, nsim) 
sim_sigma_pheno=cbind(FID=rownames(sim_sigma_pheno),IID=rownames(sim_sigma_pheno),as.data.frame(sim_sigma_pheno))


tic=Sys.time()
tempres_sigma_pheno <- lmmGRM(sim_sigma_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))
toc=Sys.time()
toc - tic
pp<-myplot(tempres_sigma_pheno,post_titulo = glue("sigma_pheno n= {nsam} - ii={ii}"))
cat(ii,"\n")
print(pp)

read gene annotation

Code

#gene_annotation <- readRDS(glue("{INPUT}/data/expression/gene_annotation.RDS"))
gene_annotation <- readRDS(glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/data/expression/gene_annotation.RDS"))

read predicted expression

Code

read_pred_expr = function(filename)
{
  ##usage: Br_pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
  pred_expr <- vroom::vroom(filename) %>% 
  select(-FID) %>%  # Remove the FID column
  mutate(IID = str_split(IID, "_", simplify = TRUE)[, 1])  # Keep the first part of IID
  pred_expr
}

# pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))
# ## WHYYYY br-hki-rat6k__predict.txt has no chr 1????
# df1= tibble(gene=names(pred_expr)) %>% mutate(isinac=TRUE) %>% left_join(gene_annotation,by=c("gene"="gene_id")) 
# df1 %>% count(chr)
# cat("{OUTPUT}/Br_rat6k__predict.txt has no chr 1 genes!!! Where did I lose them?\n")

pred_expr = read_pred_expr(glue("{OUTPUT}/AC-filtered__predict.txt"))

Rows: 5628 Columns: 5881
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr    (2): FID, IID
dbl (5879): ENSRNOG00000015552, ENSRNOG00000016054, ENSRNOG00000049505, ENSR...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Code

## checked that I have chr1 genes
df1= tibble(gene=names(pred_expr)[-1]) %>% mutate(isinac=TRUE) %>% left_join(gene_annotation,by=c("gene"="gene_id")) 
df1 %>% count(chr)

# A tibble: 20 × 2
   chr       n
   <chr> <int>
 1 1       748
 2 10      542
 3 11      166
 4 12      186
 5 13      190
 6 14      228
 7 15      177
 8 16      177
 9 17      127
10 18      171
11 19      187
12 2       396
13 20      203
14 3       452
15 4       344
16 5       387
17 6       267
18 7       394
19 8       312
20 9       225

Code

cat("{OUTPUT}/AC-filtered__predict.txt has genes in ",length(unique(df1$chr)), "chr")

{OUTPUT}/AC-filtered__predict.txt has genes in  20 chr

define function lmm with GRM

Code

## HERE WE USE THE FULL GRM MATRIX AND CALCULATE THE INVERSE OF THE SIGMA MATRIX
## define lmm association function 
lmmGRM = function(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL)
{
  ## input pheno is a data frame with id column pheno_id_col=1 by default
  ## phenotype values are in pheno_value_cols, 6:6 by default (SCORE column location in plink output), it can have more than one phenotype
  ## but h2 has to be the same, this is useful when running simulations with different h2
  ## call lmmXcan(pheno %>% select(IID,SCORE))
  
  ## format pheno to matrix form
  phenomat <- as.matrix(pheno[,pheno_value_cols])
  rownames(phenomat) <- pheno[[pheno_id_col]]
  
  ## turn pred_expr into matrix with rownames =IID, keep only IIDs in ymat
  exp_mat = as.matrix(pred_expr %>% select(-IID))
  rownames(exp_mat) = pred_expr$IID

  ## align pheno and expr matrices
  idlist = intersect(rownames(phenomat), rownames(exp_mat))
  
  nsam = length(idlist)
  
  ## CALCULATE SIGMA
  ID_mat = diag(rep(1,nsam))
  
  #testing_scale_grm = TRUE
  #if(testing_scale_grm) grm_mat = sweep( sweep(grm_mat,2, 1/sqrt(diag(grm_mat)),"*"), 1, 1/sqrt(diag(grm_mat)),"*")    
  
  Sigma = grm_mat[idlist,idlist] * h2 + (1 - h2) * ID_mat
  
  Sig_eigen = eigen(Sigma)
  rownames(Sig_eigen$vectors) = rownames(Sigma)
  
  isighalf = Sig_eigen$vectors %*% diag( 1 / sqrt(  Sig_eigen$values  ) ) %*% t(Sig_eigen$vectors)
  
  ## perform raw association
  cormat_raw = matrix_lm(phenomat[idlist,, drop = FALSE], exp_mat[idlist,])
  pmat_raw = cor2pval(cormat_raw,nsam)
  colnames(pmat_raw) <- gsub("cor_", "pval_", colnames(pmat_raw))
  
  ## perform corrected association
  cormat_correct = matrix_lm(isighalf%*% phenomat[idlist,, drop = FALSE], isighalf %*% exp_mat[idlist,])
  pmat_correct = cor2pval(cormat_correct,nsam)
  colnames(pmat_correct) <- gsub("cor_", "pval_", colnames(pmat_correct))
  
  if(!is.null(out))
  {
    saveRDS(cormat_correct,file = glue("{out}_cormat_correct.RDS"))
    saveRDS(pmat_correct,  file = glue("{out}_pmat_correct.RDS"))
    saveRDS(cormat_raw,    file = glue("{out}_cormat_raw.RDS"))
    saveRDS(pmat_raw,      file = glue("{out}_pmat_raw.RDS"))
  }
  res = list(
    cormat_correct=cormat_correct, 
    pmat_correct=pmat_correct, 
    cormat_raw=cormat_raw, 
    pmat_raw=pmat_raw)
  res
  
}

run ratXcan

Code

## read simulated Y
pheno = read_table(glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch/sim/tempo/PRS_output_100-32240.profile"))


── Column specification ────────────────────────────────────────────────────────
cols(
  FID = col_character(),
  IID = col_character(),
  PHENO = col_double(),
  CNT = col_double(),
  CNT2 = col_double(),
  SCORE = col_double()
)

Code

## calculate h2 of pheno
## mpheno 4 means fourth column after FID and IID
## $GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $OUTPUT/sim/tempo/PRS_output_100-32240.profile --mpheno 4 --out $OUTPUT/test-2024-08-29

## create phenotype with h2
h2=0.5
pheno$trait = (pheno$SCORE - mean(pheno$SCORE))/sd(pheno$SCORE) * sqrt(h2) + sqrt(1 - h2)*rnorm(nrow(pheno))

if(recalc)
{res = lmmGRM(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=7:7,out=NULL)
saveRDS(res,glue("{OUT_TEMPO}/ratxcan-ac-res.RDS"))
} else
  res = readRDS(glue("{OUT_TEMPO}/ratxcan-ac-res.RDS"))

calc GRM without chr 1

Code

$PLINK --bfile $GENO/rat6k_autosome --chr 1 --write-snplist --out $OUT_TEMPO/chr1_snps

$GCTA --bfile $GENO/rat6k_autosome --exclude $OUT_TEMPO/chr1_snps.snplist --make-grm-bin --out $OUT_TEMPO/rat6k_autosome_loco_chr01 --thread-num 8
## that took

Code

*******************************************************************
* Genome-wide Complex Trait Analysis (GCTA)
* version v1.94.1 Mac
* (C) 2010-present, Yang Lab, Westlake University
* Please report bugs to Jian Yang <jian.yang@westlake.edu.cn>
*******************************************************************
Analysis started at 16:52:27 CDT on Thu Aug 29 2024.
Hostname: MED-ML-464.local

Accepted options:
--bfile /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome
--exclude /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist
--make-grm-bin
--out /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1
--thread-num 8

Note: the program will be running on 8 threads.

Reading PLINK FAM file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.fam].
5628 individuals to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.fam].
Reading PLINK BIM file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bim].
179895 SNPs to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bim].
Reading a list of SNPs from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist].
17602 SNPs are excluded from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist] and there are 162293 SNPs remaining.
Reading PLINK BED file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bed] in SNP-major format ...
Genotype data for 5628 individuals and 162293 SNPs to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bed].
Calculating allele frequencies ...
Recoding genotypes (individual major mode) ...

Calculating the genetic relationship matrix (GRM) ... (Note: default speed-optimized mode, may use huge RAM)

Summary of the GRM:
Mean of diagonals = 0.990807
Variance of diagonals = 0.00404564
Mean of off-diagonals = -0.000176097
Variance of off-diagonals = 0.00216189
GRM of 5628 individuals has been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.bin] (in binary format).
Number of SNPs to calculate the genetic relationship between each pair of individuals has been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.N.bin] (in binary format).
IDs for the GRM file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.bin] have been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.id].

compute GRM loco chr01 and lmmGRM with updated h2

Code

grm_mat_no_chr01 = read_GRMBin(glue("{OUT_TEMPO}/rat6k_autosome_loco_chr01.grm"))


## need to recalculate h2
# $GCTA --reml --grm $OUT_TEMPO/rat6k_autosome  --out $OUT_TEMPO/rat6k_autosome_loco_chr01 --thread-num 8

## save the phenotype with error added
write_tsv(pheno %>% select(IID, FID, trait),file=glue("{OUT_TEMPO}/pheno_PRS_output_100-32240.txt"))

# $GCTA --grm $OUT_TEMPO/rat6k_autosome_loco_chr01 --reml --pheno $OUT_TEMPO/pheno_PRS_output_100-32240.txt --mpheno 1 --out $OUT_TEMPO/loco_chr01 --thread-num 8

h2_loco_chr01 = read_tsv(glue("{OUT_TEMPO}/loco_chr01.hsq")) %>% filter(Source=="V(G)/Vp") %>% pull(Variance)

Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)

Rows: 10 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): Source
dbl (2): Variance, SE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Code

if(recalc)
{
  res_no_chr01 = lmmGRM(pheno, grm_mat_no_chr01, h2= h2_loco_chr01, pred_expr,pheno_id_col=1,pheno_value_cols=7:7,out=NULL)
  saveRDS(res_no_chr01,file=glue("{OUT_TEMPO}/ratxcan-ac-res-loco-chr01.RDS"))
} else res_no_chr01=readRDS(glue("{OUT_TEMPO}/ratxcan-ac-res-loco-chr01.RDS"))

compare pvalues in chr 1 vs other chromosomes

Code

pmat2df = function(pmat)
{
  df = data.frame(pmat)
  df$gene = rownames(pmat)
  rownames(df) = NULL
  df %>% left_join(gene_annotation %>% select(gene_id,chr,gene_name),by=c("gene"="gene_id") )
}

df_loco_chr01 = pmat2df(res_no_chr01$pmat_correct) 
df = pmat2df(res$pmat_correct) 

par(mfrow=c(2,2))
hist(df_loco_chr01 %>% filter(chr=="1") %>% pull(trait),main="ratxcan GRM no chr1 - genes in chr1")
hist(df_loco_chr01 %>% filter(chr!="1") %>% pull(trait),main="ratxcan GRM no chr1 - genes chr!=1")
hist(df %>% filter(chr=="1") %>% pull(trait),main="ratxcan GRM all - genes in chr1")
hist(df %>% filter(chr!="1") %>% pull(trait),main="ratxcan GRM all - genes chr!=1")

Code

par(mfrow=c(1,1))

par(mfrow=c(2,2))
rango = range(-log10(c(df_loco_chr01$trait, df$trait)))
qqunif(df_loco_chr01 %>% filter(chr=="1") %>% pull(trait),main="ratxcan GRM no chr1 - genes in chr1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)
qqunif(df_loco_chr01 %>% filter(chr!="1") %>% pull(trait),main="ratxcan GRM no chr1 - genes chr!=1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)
qqunif(df %>% filter(chr=="1") %>% pull(trait),main="ratxcan genes in chr1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)
qqunif(df %>% filter(chr!="1") %>% pull(trait),main="ratxcan genes chr!=1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)

Code

par(mfrow=c(1,1))


qqunif(df_loco_chr01 %>% filter(chr=="1") %>% pull(trait),main="ratxcan GRM no chr1 - genes in chr1")

Take home

removing chr1 from GRM negates the correction of ratxcan in chr1.

phenotype was simulated to have a relatedness modeling random effect (covariance=GRM), no conexion to any of the genes. Any significant association should be considered false positive.

load libraries and functions

define data and software paths for R

define data and software for the terminal

read grm matrix

define myplot

NEED DEBUG simulate \(Y = Sigma^{1/2}\epsilon\) and run assoc with expr_mat

read gene annotation

read predicted expression

define function lmm with GRM

run ratXcan

calc GRM without chr 1

compute GRM loco chr01 and lmmGRM with updated h2

compare pvalues in chr 1 vs other chromosomes

Reuse