Cistrome DB data

Author

Haky Im

Published

March 28, 2023

suppressMessages(library(tidyverse))
suppressMessages(library(glue))
# Root folder holding the data for the web posts. The commented-out lines are
# the equivalent locations for other users; enable the one for your machine.
PRE <- "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
## PRE <- "/Users/margaretperry/Library/CloudStorage/Box-Box/imlab-data/data-Github/web-data"
## PRE <- "/Users/temi/Library/CloudStorage/Box-Box/imlab-data/data-Github/web-data"

## COPY THE DATE AND SLUG FIELDS FROM THE HEADER
SLUG <- "cistromedb-data" ## copy the slug from the header
bDATE <- "2023-03-28" ## copy the date from the blog's header here

# Per-post data folder: <PRE>/<date>-<slug>
DATA <- glue("{PRE}/{bDATE}-{SLUG}")

# Create the folder from R instead of shelling out to `mkdir`: no shell
# quoting problems with spaces or special characters in the path, works on any
# OS, and `recursive = TRUE` also creates missing parent folders.
# `dir.exists()` (rather than `file.exists()`) makes sure the path is a folder.
if (!dir.exists(DATA)) {
  dir.create(DATA, recursive = TRUE)
}

# Working directory for this post is the data folder itself.
WORK <- DATA

## One-off step (kept for reference): copy a downloaded file into DATA
# tempodata <- "~/Downloads/tempo/gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv"
# system(glue::glue("cp {tempodata} {DATA}/"))

## system(glue("open {DATA}")) ## opens the data folder in the file browser

# Read the tab-separated QC table from the data folder
data <- read_tsv(file = glue("{DATA}/human_factor_full_QC.txt"))
Rows: 11348 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): Species, GSMID, Factor, Cell_line, Cell_type, Tissue_type
dbl (7): DCid, FastQC, UniquelyMappedRatio, PBC, PeaksFoldChangeAbove10, FRi...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the column names of the QC table
colnames(data)
 [1] "DCid"                   "Species"                "GSMID"                 
 [4] "Factor"                 "Cell_line"              "Cell_type"             
 [7] "Tissue_type"            "FastQC"                 "UniquelyMappedRatio"   
[10] "PBC"                    "PeaksFoldChangeAbove10" "FRiP"                  
[13] "PeaksUnionDHSRatio"    
# Dimensions of the table of distinct Factor / cell line / cell type / tissue
# combinations (rows = number of distinct combinations)
data %>%
  distinct(Factor, Cell_line, Cell_type, Tissue_type) %>%
  dim()
[1] 4426    4
# Number of rows per Factor / cell line / cell type / tissue combination,
# most frequent combinations first
data %>%
  count(Factor, Cell_line, Cell_type, Tissue_type, sort = TRUE)
# A tibble: 4,426 × 5
   Factor Cell_line Cell_type  Tissue_type     n
   <chr>  <chr>     <chr>      <chr>       <int>
 1 ESR1   MCF-7     Epithelium Breast        213
 2 AR     LNCaP     Epithelium Prostate      143
 3 POLR2A HeLa      Epithelium Cervix         76
 4 AR     VCaP      Epithelium Prostate       64
 5 POLR2A MCF-7     Epithelium Breast         64
 6 NR3C1  A549      Epithelium Lung           46
 7 POLR2A HCT-116   None       HCT116         46
 8 CTCF   MCF-7     Epithelium Breast         45
 9 FOXA1  LNCaP     Epithelium Prostate       45
10 ESR1   None      None       Breast         42
# ℹ 4,416 more rows
# Distribution of the per-combination row counts: for each count value n,
# how many combinations occur exactly n times
data %>%
  count(Factor, Cell_line, Cell_type, Tissue_type) %>%
  pull(n) %>%
  table()
.
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
2223 1196  299  282   89  106   36   43   18   28   12   17    4    8    7    6 
  17   18   19   20   22   23   24   25   26   28   29   30   31   32   33   34 
   1    8    3    2    1    1    2    3    2    1    1    3    2    2    3    3 
  37   38   40   42   45   46   64   76  143  213 
   2    1    1    1    2    2    2    1    1    1 
## Open question: are the rows with Cell_line == "None" non-tumor samples?
## Start by counting how many such rows there are.
filter(data, Cell_line == "None") %>%
  dim()
[1] 1817   13
## Number of distinct cell lines = number of rows of the count table
## (the "None" placeholder is counted as one of the values)
count(data, Cell_line) %>%
  dim()
[1] 520   2
## Number of distinct cell types = number of rows of the count table
## (the "None" placeholder is counted as one of the values)
count(data, Cell_type) %>%
  dim()
[1] 153   2
## Number of distinct tissue types = number of rows of the count table
count(data, Tissue_type) %>%
  dim()
[1] 84  2