WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"INPUT <-glue("{WEBDATA}/ratxcan-tutorial") ## this has the input data OUTPUT <-glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch") ## this has the output data, intermediate resultsGENO =glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/data/genotype/")OUT_TEMPO =glue("{OUTPUT}/testing-2024-08-29")
read_pred_expr =function(filename){##usage: Br_pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt")) pred_expr <- vroom::vroom(filename) %>%select(-FID) %>%# Remove the FID columnmutate(IID =str_split(IID, "_", simplify =TRUE)[, 1]) # Keep the first part of IID pred_expr}# pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))# ## WHYYYY br-hki-rat6k__predict.txt has no chr 1????# df1= tibble(gene=names(pred_expr)) %>% mutate(isinac=TRUE) %>% left_join(gene_annotation,by=c("gene"="gene_id")) # df1 %>% count(chr)# cat("{OUTPUT}/Br_rat6k__predict.txt has no chr 1 genes!!! Where did I lose them?\n")pred_expr =read_pred_expr(glue("{OUTPUT}/AC-filtered__predict.txt"))
Rows: 5628 Columns: 5881
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): FID, IID
dbl (5879): ENSRNOG00000015552, ENSRNOG00000016054, ENSRNOG00000049505, ENSR...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
## checked that I have chr1 genesdf1=tibble(gene=names(pred_expr)[-1]) %>%mutate(isinac=TRUE) %>%left_join(gene_annotation,by=c("gene"="gene_id")) df1 %>%count(chr)
cat("{OUTPUT}/AC-filtered__predict.txt has genes in ",length(unique(df1$chr)), "chr")
{OUTPUT}/AC-filtered__predict.txt has genes in 20 chr
define function lmm with GRM
Code
## HERE WE USE THE FULL GRM MATRIX AND CALCULATE THE INVERSE OF THE SIGMA MATRIX## define lmm association function lmmGRM =function(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL){## input pheno is a data frame with id column pheno_id_col=1 by default## phenotype values are in pheno_value_cols, 6:6 by default (SCORE column location in plink output), it can have more than one phenotype## but h2 has to be the same, this is useful when running simulations with different h2## call lmmXcan(pheno %>% select(IID,SCORE))## format pheno to matrix form phenomat <-as.matrix(pheno[,pheno_value_cols])rownames(phenomat) <- pheno[[pheno_id_col]]## turn pred_expr into matrix with rownames =IID, keep only IIDs in ymat exp_mat =as.matrix(pred_expr %>%select(-IID))rownames(exp_mat) = pred_expr$IID## align pheno and expr matrices idlist =intersect(rownames(phenomat), rownames(exp_mat)) nsam =length(idlist)## CALCULATE SIGMA ID_mat =diag(rep(1,nsam))#testing_scale_grm = TRUE#if(testing_scale_grm) grm_mat = sweep( sweep(grm_mat,2, 1/sqrt(diag(grm_mat)),"*"), 1, 1/sqrt(diag(grm_mat)),"*") Sigma = grm_mat[idlist,idlist] * h2 + (1- h2) * ID_mat Sig_eigen =eigen(Sigma)rownames(Sig_eigen$vectors) =rownames(Sigma) isighalf = Sig_eigen$vectors %*%diag( 1/sqrt( Sig_eigen$values ) ) %*%t(Sig_eigen$vectors)## perform raw association cormat_raw =matrix_lm(phenomat[idlist,, drop =FALSE], exp_mat[idlist,]) pmat_raw =cor2pval(cormat_raw,nsam)colnames(pmat_raw) <-gsub("cor_", "pval_", colnames(pmat_raw))## perform corrected association cormat_correct =matrix_lm(isighalf%*% phenomat[idlist,, drop =FALSE], isighalf %*% exp_mat[idlist,]) pmat_correct =cor2pval(cormat_correct,nsam)colnames(pmat_correct) <-gsub("cor_", "pval_", colnames(pmat_correct))if(!is.null(out)) {saveRDS(cormat_correct,file =glue("{out}_cormat_correct.RDS"))saveRDS(pmat_correct, file =glue("{out}_pmat_correct.RDS"))saveRDS(cormat_raw, file =glue("{out}_cormat_raw.RDS"))saveRDS(pmat_raw, file =glue("{out}_pmat_raw.RDS")) } res =list(cormat_correct=cormat_correct, pmat_correct=pmat_correct, cormat_raw=cormat_raw, pmat_raw=pmat_raw) res}
## calculate h2 of pheno## mpheno 4 means fourth column after FID and IID## $GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $OUTPUT/sim/tempo/PRS_output_100-32240.profile --mpheno 4 --out $OUTPUT/test-2024-08-29## create phenotype with h2h2=0.5pheno$trait = (pheno$SCORE -mean(pheno$SCORE))/sd(pheno$SCORE) *sqrt(h2) +sqrt(1- h2)*rnorm(nrow(pheno))if(recalc){res =lmmGRM(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=7:7,out=NULL)saveRDS(res,glue("{OUT_TEMPO}/ratxcan-ac-res.RDS"))} else res =readRDS(glue("{OUT_TEMPO}/ratxcan-ac-res.RDS"))
calc GRM without chr 1
Code
$PLINK--bfile$GENO/rat6k_autosome --chr 1 --write-snplist--out$OUT_TEMPO/chr1_snps$GCTA--bfile$GENO/rat6k_autosome --exclude$OUT_TEMPO/chr1_snps.snplist --make-grm-bin--out$OUT_TEMPO/rat6k_autosome_loco_chr01 --thread-num 8## that took
Code
*******************************************************************
* Genome-wide Complex Trait Analysis (GCTA)
* version v1.94.1 Mac
* (C) 2010-present, Yang Lab, Westlake University
* Please report bugs to Jian Yang <jian.yang@westlake.edu.cn>
*******************************************************************
Analysis started at 16:52:27 CDT on Thu Aug 29 2024.
Hostname: MED-ML-464.local
Accepted options:
--bfile /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome
--exclude /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist
--make-grm-bin
--out /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1
--thread-num 8
Note: the program will be running on 8 threads.
Reading PLINK FAM file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.fam].
5628 individuals to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.fam].
Reading PLINK BIM file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bim].
179895 SNPs to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bim].
Reading a list of SNPs from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist].
17602 SNPs are excluded from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist] and there are 162293 SNPs remaining.
Reading PLINK BED file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bed] in SNP-major format ...
Genotype data for 5628 individuals and 162293 SNPs to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bed].
Calculating allele frequencies ...
Recoding genotypes (individual major mode) ...
Calculating the genetic relationship matrix (GRM) ... (Note: default speed-optimized mode, may use huge RAM)
Summary of the GRM:
Mean of diagonals = 0.990807
Variance of diagonals = 0.00404564
Mean of off-diagonals = -0.000176097
Variance of off-diagonals = 0.00216189
GRM of 5628 individuals has been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.bin] (in binary format).
Number of SNPs to calculate the genetic relationship between each pair of individuals has been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.N.bin] (in binary format).
IDs for the GRM file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.bin] have been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.id].
compute GRM loco chr01 and lmmGRM with updated h2
Code
grm_mat_no_chr01 =read_GRMBin(glue("{OUT_TEMPO}/rat6k_autosome_loco_chr01.grm"))## need to recalculate h2# $GCTA --reml --grm $OUT_TEMPO/rat6k_autosome --out $OUT_TEMPO/rat6k_autosome_loco_chr01 --thread-num 8## save the phenotype with error addedwrite_tsv(pheno %>%select(IID, FID, trait),file=glue("{OUT_TEMPO}/pheno_PRS_output_100-32240.txt"))# $GCTA --grm $OUT_TEMPO/rat6k_autosome_loco_chr01 --reml --pheno $OUT_TEMPO/pheno_PRS_output_100-32240.txt --mpheno 1 --out $OUT_TEMPO/loco_chr01 --thread-num 8h2_loco_chr01 =read_tsv(glue("{OUT_TEMPO}/loco_chr01.hsq")) %>%filter(Source=="V(G)/Vp") %>%pull(Variance)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 10 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): Source
dbl (2): Variance, SE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pmat2df =function(pmat){ df =data.frame(pmat) df$gene =rownames(pmat)rownames(df) =NULL df %>%left_join(gene_annotation %>%select(gene_id,chr,gene_name),by=c("gene"="gene_id") )}df_loco_chr01 =pmat2df(res_no_chr01$pmat_correct) df =pmat2df(res$pmat_correct) par(mfrow=c(2,2))hist(df_loco_chr01 %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes in chr1")hist(df_loco_chr01 %>%filter(chr!="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes chr!=1")hist(df %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM all - genes in chr1")hist(df %>%filter(chr!="1") %>%pull(trait),main="ratxcan GRM all - genes chr!=1")
Code
par(mfrow=c(1,1))par(mfrow=c(2,2))rango =range(-log10(c(df_loco_chr01$trait, df$trait)))qqunif(df_loco_chr01 %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes in chr1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)qqunif(df_loco_chr01 %>%filter(chr!="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes chr!=1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)qqunif(df %>%filter(chr=="1") %>%pull(trait),main="ratxcan genes in chr1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)qqunif(df %>%filter(chr!="1") %>%pull(trait),main="ratxcan genes chr!=1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)
Code
par(mfrow=c(1,1))qqunif(df_loco_chr01 %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes in chr1")
Take home
removing chr1 from GRM negates the correction of ratxcan in chr1.
phenotype was simulated to have a relatedness modeling random effect (covariance=GRM), no conexion to any of the genes. Any significant association should be considered false positive.
---title: compare ratxcan with and without locodate: 2024-08-29author: Haky Imeditor_options: chunk_output_type: consoledescription: compare ratxcan mixed effects with and without loco---## load libraries and functions```{r load libraries and functions}#options(error=recover)#options(error=browser)options(error=NULL)## compare observed correlation with null correlationsuppressMessages(devtools::source_gist("a925fea01b365a8c605e")) ## load qqR fn https://gist.github.com/hakyim/a925fea01b365a8c605esuppressMessages(devtools::source_gist("38431b74c6c0bf90c12f")) ## qqunif https://gist.github.com/hakyim/38431b74c6c0bf90c12fsuppressMessages(devtools::source_gist("115403f16bec0a0e871f3616d552ce9b")) ## source ratxcan functions https://gist.github.com/hakyim/115403f16bec0a0e871f3616d552ce9b suppressMessages(library(tidyverse))suppressMessages(library(glue))suppressMessages(library(RSQLite))#suppressMessages(library(expm))#suppressMessages(library(readxl))# install.packages("devtools")# library("devtools")# install_github("jdstorey/qvalue")suppressMessages(library(qvalue))# if (!require("BiocManager", quietly = TRUE))# install.packages("BiocManager")# # BiocManager::install("biomaRt")##suppressMessages(library(biomaRt))##install.packages("ggrepel")suppressMessages(library(ggrepel))recalc=FALSE```## define data and software paths for R```{r define data and software paths}WEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"INPUT <-glue("{WEBDATA}/ratxcan-tutorial") ## this has the input data OUTPUT <-glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch") ## this has the output data, intermediate resultsGENO =glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/data/genotype/")OUT_TEMPO =glue("{OUTPUT}/testing-2024-08-29")```## define data and software for the terminal```{bash define data and software paths bash}#| eval: FALSEWEBDATA="/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"PLINK="/Users/haekyungim/bin/plink_mac_20231211/plink"GCTA="/Users/haekyungim/bin/gcta-1.94.2-MacOS-ARM-x86_64/gcta64"INPUT=$WEBDATA/ratxcan-tutorialOUTPUT=$WEBDATA/2023-11-27-ratxcan-tutorial/scratchGENO=$WEBDATA/2023-11-27-ratxcan-tutorial/data/genotype/OUT_TEMPO=$OUTPUT/testing-2024-08-29```## read grm matrix```{r read GRM mat}grm_mat <-read_GRMBin(glue("{OUTPUT}/rat6k_autosome.grm"))```## define myplot```{r define myplot}myplot <-function(tempres, post_titulo="",semilla="") {# Create a data frame with specific columns df <-data.frame(p0.01_yes =apply(tempres$pmat_correct, 2, function(x) mean(x <0.01)),p0.01_no =apply(tempres$pmat_raw, 2, function(x) mean(x <0.01)),p0.05_yes =apply(tempres$pmat_correct, 2, function(x) mean(x <0.05)),p0.05_no =apply(tempres$pmat_raw, 2, function(x) mean(x <0.05)),p0.10_yes =apply(tempres$pmat_correct, 2, function(x) mean(x <0.10)), p0.10_no =apply(tempres$pmat_raw, 2, function(x) mean(x <0.10))# ... [rest of your code for creating df] ... )# Pivot the data frame to long format, specifying the columns to keepdf_long <-pivot_longer(df, cols =starts_with("p"))df_long <- df_long %>%separate(name,into =c("threshold","corrected"),sep="_") %>%rename(proportion=value)# Rename the name column to replace p0.xx with p<0.xxdf_long <- df_long %>%mutate(threshold =gsub("p0\\.", "p<0.", threshold))# Create boxplots with mean pp <-ggplot(df_long, aes(x = threshold, y = proportion, fill = corrected)) +geom_boxplot(alpha =0.6) +stat_summary(fun = mean, geom ="point", shape =3, size =2, stroke =2, color ="blue", #position = position_dodge(width = 0.8)) +position =position_dodge(width =-0.1)) +#stat_summary(fun = mean, geom = "crossbar", size = .5, color = "blue") +#stat_summary(fun = mean, geom = "crossbar", size = .5, color = "darkgray") +geom_hline(yintercept =c(0.01, 0.05, 0.10), linetype ="dashed", color ="gray") +theme_minimal(base_size =17) +ggtitle(glue("Type I Error Calibration {semilla} {post_titulo}")) +xlab("significance") +ylab("false positive rate") pp}```### NEED DEBUG simulate $Y = Sigma^{1/2}\epsilon$ and run assoc with expr_mat```{r LONG RUN Y sigma^.5 epsimat}#| eval: FALSEnsam=nrow(grm_mat)#ind=1:nsamind=1:1000test_mat = grm_mat[ind,ind]nsam=nrow(test_mat)Sigma = test_mat * h2 + (1- h2) *diag(rep(1,nsam))Sig_eigen =eigen(Sigma)rownames(Sig_eigen$vectors) =rownames(Sigma)##sighalf = Sig_eigen$vectors %*% diag( sqrt( Sig_eigen$values ) ) %*% t(Sig_eigen$vectors)## make this multiplication more efficient using sweepsighalf = Sig_eigen$vectors %*%sweep(t(Sig_eigen$vectors),1,sqrt( Sig_eigen$values ),"*")sim_sigma_pheno = sighalf %*%matrix(rnorm(nsam * nsim), nsam, nsim) sim_sigma_pheno=cbind(FID=rownames(sim_sigma_pheno),IID=rownames(sim_sigma_pheno),as.data.frame(sim_sigma_pheno))tic=Sys.time()tempres_sigma_pheno <-lmmGRM(sim_sigma_pheno,grm_mat, h2,pred_expr,pheno_id_col=1, pheno_value_cols=2+(1:nsim))toc=Sys.time()toc - ticpp<-myplot(tempres_sigma_pheno,post_titulo =glue("sigma_pheno n= {nsam} - ii={ii}"))cat(ii,"\n")print(pp)```## read gene annotation```{r read gene annotation}#gene_annotation <- readRDS(glue("{INPUT}/data/expression/gene_annotation.RDS"))gene_annotation <-readRDS(glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/data/expression/gene_annotation.RDS"))```## read predicted expression```{r read predicted expression}read_pred_expr =function(filename){##usage: Br_pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt")) pred_expr <- vroom::vroom(filename) %>%select(-FID) %>%# Remove the FID columnmutate(IID =str_split(IID, "_", simplify =TRUE)[, 1]) # Keep the first part of IID pred_expr}# pred_expr = read_pred_expr(glue("{OUTPUT}/Br-hki-rat6k__predict.txt"))# ## WHYYYY br-hki-rat6k__predict.txt has no chr 1????# df1= tibble(gene=names(pred_expr)) %>% mutate(isinac=TRUE) %>% left_join(gene_annotation,by=c("gene"="gene_id")) # df1 %>% count(chr)# cat("{OUTPUT}/Br_rat6k__predict.txt has no chr 1 genes!!! Where did I lose them?\n")pred_expr =read_pred_expr(glue("{OUTPUT}/AC-filtered__predict.txt"))## checked that I have chr1 genesdf1=tibble(gene=names(pred_expr)[-1]) %>%mutate(isinac=TRUE) %>%left_join(gene_annotation,by=c("gene"="gene_id")) df1 %>%count(chr)cat("{OUTPUT}/AC-filtered__predict.txt has genes in ",length(unique(df1$chr)), "chr")```## define function lmm with GRM```{r define lmmGRM function}## HERE WE USE THE FULL GRM MATRIX AND CALCULATE THE INVERSE OF THE SIGMA MATRIX## define lmm association function lmmGRM =function(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=6:6,out=NULL){## input pheno is a data frame with id column pheno_id_col=1 by default## phenotype values are in pheno_value_cols, 6:6 by default (SCORE column location in plink output), it can have more than one phenotype## but h2 has to be the same, this is useful when running simulations with different h2## call lmmXcan(pheno %>% select(IID,SCORE))## format pheno to matrix form phenomat <-as.matrix(pheno[,pheno_value_cols])rownames(phenomat) <- pheno[[pheno_id_col]]## turn pred_expr into matrix with rownames =IID, keep only IIDs in ymat exp_mat =as.matrix(pred_expr %>%select(-IID))rownames(exp_mat) = pred_expr$IID## align pheno and expr matrices idlist =intersect(rownames(phenomat), rownames(exp_mat)) nsam =length(idlist)## CALCULATE SIGMA ID_mat =diag(rep(1,nsam))#testing_scale_grm = TRUE#if(testing_scale_grm) grm_mat = sweep( sweep(grm_mat,2, 1/sqrt(diag(grm_mat)),"*"), 1, 1/sqrt(diag(grm_mat)),"*") Sigma = grm_mat[idlist,idlist] * h2 + (1- h2) * ID_mat Sig_eigen =eigen(Sigma)rownames(Sig_eigen$vectors) =rownames(Sigma) isighalf = Sig_eigen$vectors %*%diag( 1/sqrt( Sig_eigen$values ) ) %*%t(Sig_eigen$vectors)## perform raw association cormat_raw =matrix_lm(phenomat[idlist,, drop =FALSE], exp_mat[idlist,]) pmat_raw =cor2pval(cormat_raw,nsam)colnames(pmat_raw) <-gsub("cor_", "pval_", colnames(pmat_raw))## perform corrected association cormat_correct =matrix_lm(isighalf%*% phenomat[idlist,, drop =FALSE], isighalf %*% exp_mat[idlist,]) pmat_correct =cor2pval(cormat_correct,nsam)colnames(pmat_correct) <-gsub("cor_", "pval_", colnames(pmat_correct))if(!is.null(out)) {saveRDS(cormat_correct,file =glue("{out}_cormat_correct.RDS"))saveRDS(pmat_correct, file =glue("{out}_pmat_correct.RDS"))saveRDS(cormat_raw, file =glue("{out}_cormat_raw.RDS"))saveRDS(pmat_raw, file =glue("{out}_pmat_raw.RDS")) } res =list(cormat_correct=cormat_correct, pmat_correct=pmat_correct, cormat_raw=cormat_raw, pmat_raw=pmat_raw) res}```## run ratXcan```{r}## read simulated Ypheno =read_table(glue("{WEBDATA}/2023-11-27-ratxcan-tutorial/scratch/sim/tempo/PRS_output_100-32240.profile"))## calculate h2 of pheno## mpheno 4 means fourth column after FID and IID## $GCTA --grm $OUTPUT/rat6k_autosome --reml --pheno $OUTPUT/sim/tempo/PRS_output_100-32240.profile --mpheno 4 --out $OUTPUT/test-2024-08-29## create phenotype with h2h2=0.5pheno$trait = (pheno$SCORE -mean(pheno$SCORE))/sd(pheno$SCORE) *sqrt(h2) +sqrt(1- h2)*rnorm(nrow(pheno))if(recalc){res =lmmGRM(pheno, grm_mat, h2, pred_expr,pheno_id_col=1,pheno_value_cols=7:7,out=NULL)saveRDS(res,glue("{OUT_TEMPO}/ratxcan-ac-res.RDS"))} else res =readRDS(glue("{OUT_TEMPO}/ratxcan-ac-res.RDS"))```## calc GRM without chr 1```{bash calculate GRM without chr01}#| eval: false$PLINK--bfile$GENO/rat6k_autosome --chr 1 --write-snplist--out$OUT_TEMPO/chr1_snps$GCTA--bfile$GENO/rat6k_autosome --exclude$OUT_TEMPO/chr1_snps.snplist --make-grm-bin--out$OUT_TEMPO/rat6k_autosome_loco_chr01 --thread-num 8## that took ``````{text }******************************************************************** Genome-wide Complex Trait Analysis (GCTA)* version v1.94.1 Mac* (C) 2010-present, Yang Lab, Westlake University* Please report bugs to Jian Yang <jian.yang@westlake.edu.cn>*******************************************************************Analysis started at 16:52:27 CDT on Thu Aug 29 2024.Hostname: MED-ML-464.localAccepted options:--bfile /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome--exclude /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist--make-grm-bin--out /Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1--thread-num 8Note: the program will be running on 8 threads.Reading PLINK FAM file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.fam].5628 individuals to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.fam].Reading PLINK BIM file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bim].179895 SNPs to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bim].Reading a list of SNPs from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist].17602 SNPs are excluded from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/testing-2024-08-29/chr1_snps.snplist] and there are 162293 SNPs remaining.Reading PLINK BED file from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bed] in SNP-major format ...Genotype data for 5628 individuals and 162293 SNPs to be included from [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/data/genotype//rat6k_autosome.bed].Calculating allele frequencies ...Recoding genotypes (individual major mode) ...Calculating the genetic relationship matrix (GRM) ... (Note: default speed-optimized mode, may use huge RAM)Summary of the GRM:Mean of diagonals = 0.990807Variance of diagonals = 0.00404564Mean of off-diagonals = -0.000176097Variance of off-diagonals = 0.00216189GRM of 5628 individuals has been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.bin] (in binary format).Number of SNPs to calculate the genetic relationship between each pair of individuals has been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.N.bin] (in binary format).IDs for the GRM file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.bin] have been saved in the file [/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-11-27-ratxcan-tutorial/scratch/rat6k_autosome_loco_chr1.grm.id].```## compute GRM loco chr01 and lmmGRM with updated h2```{r compute GRM no chr01 and lmmGRM}grm_mat_no_chr01 =read_GRMBin(glue("{OUT_TEMPO}/rat6k_autosome_loco_chr01.grm"))## need to recalculate h2# $GCTA --reml --grm $OUT_TEMPO/rat6k_autosome --out $OUT_TEMPO/rat6k_autosome_loco_chr01 --thread-num 8## save the phenotype with error addedwrite_tsv(pheno %>%select(IID, FID, trait),file=glue("{OUT_TEMPO}/pheno_PRS_output_100-32240.txt"))# $GCTA --grm $OUT_TEMPO/rat6k_autosome_loco_chr01 --reml --pheno $OUT_TEMPO/pheno_PRS_output_100-32240.txt --mpheno 1 --out $OUT_TEMPO/loco_chr01 --thread-num 8h2_loco_chr01 =read_tsv(glue("{OUT_TEMPO}/loco_chr01.hsq")) %>%filter(Source=="V(G)/Vp") %>%pull(Variance)if(recalc){ res_no_chr01 =lmmGRM(pheno, grm_mat_no_chr01, h2= h2_loco_chr01, pred_expr,pheno_id_col=1,pheno_value_cols=7:7,out=NULL)saveRDS(res_no_chr01,file=glue("{OUT_TEMPO}/ratxcan-ac-res-loco-chr01.RDS"))} else res_no_chr01=readRDS(glue("{OUT_TEMPO}/ratxcan-ac-res-loco-chr01.RDS"))```## compare pvalues in chr 1 vs other chromosomes```{r compare pvalues in chr 1 vs other chromosomes}pmat2df =function(pmat){ df =data.frame(pmat) df$gene =rownames(pmat)rownames(df) =NULL df %>%left_join(gene_annotation %>%select(gene_id,chr,gene_name),by=c("gene"="gene_id") )}df_loco_chr01 =pmat2df(res_no_chr01$pmat_correct) df =pmat2df(res$pmat_correct) par(mfrow=c(2,2))hist(df_loco_chr01 %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes in chr1")hist(df_loco_chr01 %>%filter(chr!="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes chr!=1")hist(df %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM all - genes in chr1")hist(df %>%filter(chr!="1") %>%pull(trait),main="ratxcan GRM all - genes chr!=1")par(mfrow=c(1,1))par(mfrow=c(2,2))rango =range(-log10(c(df_loco_chr01$trait, df$trait)))qqunif(df_loco_chr01 %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes in chr1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)qqunif(df_loco_chr01 %>%filter(chr!="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes chr!=1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)qqunif(df %>%filter(chr=="1") %>%pull(trait),main="ratxcan genes in chr1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)qqunif(df %>%filter(chr!="1") %>%pull(trait),main="ratxcan genes chr!=1",BH=FALSE,CI=FALSE,xlim=rango,ylim=rango)par(mfrow=c(1,1))qqunif(df_loco_chr01 %>%filter(chr=="1") %>%pull(trait),main="ratxcan GRM no chr1 - genes in chr1")```::: {.callout-warning}## Take homeremoving chr1 from GRM negates the correction of ratxcan in chr1.:::> phenotype was simulated to have a relatedness modeling random effect (covariance=GRM), no conexion to any of the genes. Any significant association should be considered false positive.