Author

Hae Kyung Im

Published

August 2, 2020

Modified

June 17, 2024

GWAS catalog

Code
## Load packages quietly: tidyverse for data wrangling/plotting, glue for
## string interpolation.
suppressMessages(library(tidyverse))
suppressMessages(library(glue))

## Root folder that holds the data for the web posts (machine-specific path).
PRE <- "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"

SLUG <- "gwas-catalog" ## copy the slug from the header
bDATE <- "2020-08-02" ## copy the date from the blog's header here
DATA <- glue("{PRE}/{bDATE}-{SLUG}")

## Create the post's data folder when it is missing. dir.create() is used
## instead of shelling out to `mkdir` so that paths containing spaces or shell
## metacharacters work and the code is portable; recursive = TRUE also creates
## any missing parent folders.
if (!dir.exists(DATA)) dir.create(DATA, recursive = TRUE)
WORK <- DATA

##  system(glue("open {DATA}")) ## this will open the folder 
Code
## Download from https://www.ebi.ac.uk/gwas/api/search/downloads/alternative

## Alternative DATA location (kept commented out):
## DATA = "/Users/haekyungim/Box/LargeFiles/imlab-data/data-Github/analysis-hki"

## Read the gzipped, tab-separated associations table. guess_max sets how many
## rows readr may inspect when guessing column types.
gwascat <- read_tsv(
  glue("{DATA}/gwas_catalog_v1.0.2-associations_e109_r2023-06-03.tsv.gz"),
  guess_max = 100000
)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 529481 Columns: 38
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (28): FIRST AUTHOR, JOURNAL, LINK, STUDY, DISEASE/TRAIT, INITIAL SAMPLE...
dbl   (8): PUBMEDID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, MERGE...
date  (2): DATE ADDED TO CATALOG, DATE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
## Other catalog file names, kept for reference (presumably earlier downloads,
## judging by the release dates in the names):
##"gwas_catalog_v1.0.2-associations_e100_r2020-08-05.tsv"
##"gwas_catalog_v1.0.2-associations_e100_r2020-07-14.tsv"

## Number of rows and columns in the associations table.
gwascat %>% dim()
[1] 529481     38
Code
## Compact overview of every column: name, type and first few values.
gwascat %>% glimpse()
Rows: 529,481
Columns: 38
$ `DATE ADDED TO CATALOG`      <date> 2022-07-04, 2022-07-04, 2022-07-04, 2022…
$ PUBMEDID                     <dbl> 33462482, 33462482, 33462482, 33462482, 3…
$ `FIRST AUTHOR`               <chr> "Ruhlemann MC", "Ruhlemann MC", "Ruhleman…
$ DATE                         <date> 2021-01-18, 2021-01-18, 2021-01-18, 2021…
$ JOURNAL                      <chr> "Nat Genet", "Nat Genet", "Nat Genet", "N…
$ LINK                         <chr> "www.ncbi.nlm.nih.gov/pubmed/33462482", "…
$ STUDY                        <chr> "Genome-wide association study in 8,956 G…
$ `DISEASE/TRAIT`              <chr> "TestASV_20 (Phascolarctobacterium) preva…
$ `INITIAL SAMPLE SIZE`        <chr> "8,956 German ancestry individuals", "8,9…
$ `REPLICATION SAMPLE SIZE`    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ REGION                       <chr> "9p21.3", NA, NA, "17q24.3", "20q13.31", …
$ CHR_ID                       <chr> "9", NA, NA, "17", "20", "3", NA, NA, "11…
$ CHR_POS                      <chr> "22175189", NA, NA, "72438652", "56989649…
$ `REPORTED GENE(S)`           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ MAPPED_GENE                  <chr> "CDKN2B-AS1 - DMRTA1", NA, NA, "LINC00511…
$ UPSTREAM_GENE_ID             <chr> "ENSG00000240498", NA, NA, NA, "ENSG00000…
$ DOWNSTREAM_GENE_ID           <chr> "ENSG00000176399", NA, NA, NA, "ENSG00000…
$ SNP_GENE_IDS                 <chr> NA, NA, NA, "ENSG00000227036", NA, "ENSG0…
$ UPSTREAM_GENE_DISTANCE       <dbl> 47086, NA, NA, NA, 203562, NA, NA, NA, NA…
$ DOWNSTREAM_GENE_DISTANCE     <dbl> 271635, NA, NA, NA, 179104, NA, NA, NA, N…
$ `STRONGEST SNP-RISK ALLELE`  <chr> "rs10965279-?", "chr11:60833276-?", "chr1…
$ SNPS                         <chr> "rs10965279", "chr11:60833276", "chr13:43…
$ MERGED                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SNP_ID_CURRENT               <chr> "10965279", NA, NA, "7223271", "910832", …
$ CONTEXT                      <chr> "intergenic_variant", NA, NA, "intron_var…
$ INTERGENIC                   <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,…
$ `RISK ALLELE FREQUENCY`      <chr> "NR", "NR", "NR", "NR", "NR", "NR", "NR",…
$ `P-VALUE`                    <dbl> 5e-08, 2e-06, 9e-06, 9e-06, 8e-06, 3e-06,…
$ PVALUE_MLOG                  <dbl> 7.301030, 5.698970, 5.045757, 5.045757, 5…
$ `P-VALUE (TEXT)`             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ `OR or BETA`                 <dbl> 0.5047070, 0.3521481, 0.3854320, 0.427226…
$ `95% CI (TEXT)`              <chr> "[0.32-0.69] unit increase", "[0.21-0.5] …
$ `PLATFORM [SNPS PASSING QC]` <chr> "NR [6900000] (imputed)", "NR [6900000] (…
$ CNV                          <chr> "N", "N", "N", "N", "N", "N", "N", "N", "…
$ MAPPED_TRAIT                 <chr> "gut microbiome measurement", "gut microb…
$ MAPPED_TRAIT_URI             <chr> "http://www.ebi.ac.uk/efo/EFO_0007874", "…
$ `STUDY ACCESSION`            <chr> "GCST90011694", "GCST90011694", "GCST9001…
$ `GENOTYPING TECHNOLOGY`      <chr> "Genome-wide genotyping array", "Genome-w…
Code
## Number of distinct MAPPED_TRAIT / CHR_POS combinations (rows of the count
## table); the column count is reported alongside it by dim().
dim(count(gwascat, MAPPED_TRAIT, CHR_POS))
[1] 373864      3
Code
## Number of distinct MAPPED_TRAIT values (NA, if present, counts as one).
gwascat$MAPPED_TRAIT %>% unique() %>% length()
[1] 8723
Code
## Number of distinct CHR_POS values (NA, if present, counts as one).
gwascat$CHR_POS %>% unique() %>% length()
[1] 256855
Code
## Catalog size at each download:
##   2023-06-14: 256,855 distinct CHR_POS values
##               (373,864 distinct MAPPED_TRAIT/CHR_POS pairs)
##   2023-06-14: 8,723 distinct traits
##   2020:       146,359 distinct trait/variant
##   2020:       3,758 distinct traits

## Genome-wide significant associations (p < 5e-8), tagged with the year in
## which each row was added to the catalog.
gwascat_sig <- gwascat %>%
  mutate(
    year = as.factor(
      lubridate::year(lubridate::as_date(`DATE ADDED TO CATALOG`))
    )
  ) %>%
  filter(`P-VALUE` < 5e-8)

## Bar chart of significant associations per year. 2023 is left out,
## presumably because the download covers only part of that year.
gwascat_sig %>%
  filter(year != "2023") %>%
  ggplot(aes(year)) +
  geom_bar() +
  theme_bw(base_size = 15) +
  scale_x_discrete(breaks = c("2008", "2012", "2016", "2020", "2022")) +
  labs(
    x = "year",
    y = "GWAS loci reported p<5e-8",
    title = "GWAS Catalog Downloaded 2023-06-14"
  )

Code
## Optional: uncomment to save the plot as a PDF under the data folder.
## NOTE(review): the `gwas-catalog` subfolder is not created anywhere in the
## code shown here; confirm it exists before enabling this line.
##ggsave(glue::glue("{DATA}/gwas-catalog/gwas-catalog-by-year.pdf"))
  • number of significant SNPs
Code
## Distinct CHR_POS values among the significant associations.
## NOTE(review): CHR_POS is used without CHR_ID, so equal positions on
## different chromosomes would be merged; confirm this is intended.
dim(count(gwascat_sig, CHR_POS))
[1] 185850      2
  • number of significant trait/SNP pairs
Code
## Distinct CHR_POS / MAPPED_TRAIT combinations among the significant
## associations.
dim(count(gwascat_sig, CHR_POS, MAPPED_TRAIT))
[1] 288566      3

In 2020 the same count gave `[1] 94664 3`, i.e. 94,664 significant trait/SNP pairs.

  • number of traits with significant SNPs
Code
## Distinct MAPPED_TRAIT values that have at least one significant association.
dim(count(gwascat_sig, MAPPED_TRAIT))
[1] 7427    2

In 2020 the same count gave `[1] 2584 2`, i.e. 2,584 traits with significant SNPs.

  • number of unique studies
Code
## Distinct values of the STUDY column across the whole catalog.
## NOTE(review): STUDY appears to hold the study title; PUBMEDID or
## `STUDY ACCESSION` may be a more robust study identifier. Confirm.
dim(count(gwascat, STUDY))
[1] 5605    2