PredictDB weight distribution

analysis
Author

Haky Im

Published

February 28, 2023

Goal: get effect size distribution of omic traits

Code
suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
##PRE="/Users/margaretperry/Library/CloudStorage/Box-Box/imlab-data/data-Github/web-data "
##PRE="/Users/temi/Library/CloudStorage/Box-Box/imlab-data/data-Github/web-data"
## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG="predictdb-weight-distribution" ## copy the slug from the header
bDATE='2023-02-28' ## copy the date from the blog's header here
DATA = glue("{PRE}/{bDATE}-{SLUG}")
if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))
WORK=DATA
USERNAME="haekyungim"
## move data to DATA
#tempodata=("~/Downloads/tempo/gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv")
#system(glue::glue("cp {tempodata} {DATA}/"))
## system(glue("open {DATA}")) ## this will open the folder 
Code
if(F)
{
  setwd(DATA)
  system(glue("wget https://zenodo.org/record/3518299/files/mashr_eqtl.tar?download=1"))
  system(glue("tar xvf mashr_eqtl.tar"))
  system(glue("cd eqtl"))
  system("rm *.txt.gz")
}
Code
library("RSQLite")
sqlite <- dbDriver("SQLite")

myshowdist = function(dbname,titulo="n=")
{
print(dbname)
db = dbConnect(sqlite,dbname)
weights = dbGetQuery(db, "select * from weights")
absweivec = abs(weights$weight)
qlines = quantile(absweivec,c(0.2,0.5)) %>% round(2)
quantile(absweivec,(0:10)/10) %>% round(2) %>% print()
pp <- weights %>% mutate(abswei=abs(weight)) %>% ggplot(aes(abswei))+geom_histogram() + geom_vline(xintercept=qlines) +ggtitle(glue("{titulo}; 20%={qlines[1]}, median={qlines[2]}"))
print(pp)
dbDisconnect(db)
}
Code
dbname <- glue("{DATA}/eqtl/mashr_Adipose_Subcutaneous.db") ## add full path if db file not in current directory
myshowdist(dbname,titulo="Adipose expr n=491")
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-02-28-predictdb-weight-distribution/eqtl/mashr_Adipose_Subcutaneous.db
  0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
0.00 0.02 0.05 0.08 0.12 0.15 0.20 0.25 0.34 0.50 2.47 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Code
dbname <- glue("{DATA}/eqtl/mashr_Brain_Substantia_nigra.db") ## add full path if db file not in current directory
myshowdist(dbname,titulo="Brain SN expr - n=101")
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-02-28-predictdb-weight-distribution/eqtl/mashr_Brain_Substantia_nigra.db
  0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
0.00 0.01 0.02 0.04 0.07 0.10 0.14 0.20 0.29 0.44 2.56 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Code
## can download from https://uchicago.box.com/shared/static/m3zsxy3oy8kn5gkktkuo8lvui269hxi5.db
print("AA")
[1] "AA"
Code
dbname <- glue::glue("/Users/{USERNAME}/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_AA_hg38.db")
myshowdist(dbname,titulo="protein ARIC AA n=1,871")
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_AA_hg38.db
  0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
0.00 0.00 0.00 0.00 0.00 0.01 0.01 0.01 0.02 0.03 0.59 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Code
## can download from https://uchicago.box.com/shared/static/m3zsxy3oy8kn5gkktkuo8lvui269hxi5.db
print("EUR")
[1] "EUR"
Code
dbname <- glue::glue("/Users/{USERNAME}/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_EA_hg38.db")
myshowdist(dbname,titulo="protein ARIC EA n=7,213")
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_EA_hg38.db
  0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
0.00 0.00 0.00 0.00 0.00 0.01 0.01 0.01 0.01 0.03 0.55 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Code
##install.packages("arrow")
tempo = arrow::read_parquet(glue::glue("{DATA}/brainxcan-gw_lasso_beta.parquet"))
tempo = as.matrix(tempo %>% select(starts_with("IDP")))
kk = abs(tempo[tempo!=0])
print(summary(kk))
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.000363 0.000909 0.001366 0.001849 0.065254 
Code
print(quantile(kk,(0:10)/10))
          0%          10%          20%          30%          40%          50% 
1.513009e-11 1.207948e-04 2.761802e-04 4.559484e-04 6.659417e-04 9.090500e-04 
         60%          70%          80%          90%         100% 
1.205991e-03 1.594788e-03 2.162211e-03 3.165862e-03 6.525400e-02 
Code
print(quantile(kk,c(0.2,0.5)))
         20%          50% 
0.0002761802 0.0009090500 

get protein predictors with FDR < 0.05

just need to count the number of prediction models

Code
print("Protein ARIC models AA")
[1] "Protein ARIC models AA"
Code
dbname <- glue::glue("/Users/{USERNAME}/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_AA_hg38.db")
print(dbname)
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_AA_hg38.db
Code
db = dbConnect(sqlite,dbname)
##weights = dbGetQuery(db, "select * from weights")
extra = dbGetQuery(db,"select * from extra")
nrow(extra)
[1] 1368
Code
dbDisconnect(db)

print("Protein ARIC models EUR")
[1] "Protein ARIC models EUR"
Code
dbname <- glue::glue("/Users/{USERNAME}/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_EA_hg38.db")
print(dbname)
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Within-Lab-Sharing/Sabrina-Data/ARIC/ARIC_EA_hg38.db
Code
db = dbConnect(sqlite,dbname)
##weights = dbGetQuery(db, "select * from weights")
extra = dbGetQuery(db,"select * from extra")
nrow(extra)
[1] 1318
Code
dbDisconnect(db)
Code
dbname <- glue("{DATA}/en_Adipose_Subcutaneous.db") ## add full path if db file not in current directory
print("Adipose models")
[1] "Adipose models"
Code
print(dbname)
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-02-28-predictdb-weight-distribution/en_Adipose_Subcutaneous.db
Code
db = dbConnect(sqlite,dbname)
##weights = dbGetQuery(db, "select * from weights")
extra = dbGetQuery(db,"select * from extra")
nrow(extra)
[1] 8650
Code
dbDisconnect(db)


dbname <- glue("{DATA}/en_Brain_Substantia_nigra.db") ## add full path if db file not in current directory
print("Brain Substantia nigra models")
[1] "Brain Substantia nigra models"
Code
print(dbname)
/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data/2023-02-28-predictdb-weight-distribution/en_Brain_Substantia_nigra.db
Code
db = dbConnect(sqlite,dbname)
##weights = dbGetQuery(db, "select * from weights")
extra = dbGetQuery(db,"select * from extra")
nrow(extra)
[1] 2559
Code
dbDisconnect(db)