suppressMessages(library(tidyverse))
suppressMessages(library(glue))
## Root of the shared web-data folder on this machine; alternative per-user
## paths are kept commented out below.
PRE = "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
##PRE="/Users/margaretperry/Library/CloudStorage/Box-Box/imlab-data/data-Github/web-data "
##PRE="/Users/temi/Library/CloudStorage/Box-Box/imlab-data/data-Github/web-data"
## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG = "cistromedb-data" ## copy the slug from the header
bDATE = '2023-03-28' ## copy the date from the blog's header here
## Data folder for this post: <PRE>/<date>-<slug>
DATA = glue("{PRE}/{bDATE}-{SLUG}")
## Create the folder from R instead of shelling out to an unquoted `mkdir`,
## which breaks on paths containing spaces or shell metacharacters.
if (!dir.exists(DATA)) dir.create(DATA)
WORK = DATA
## move data to DATA
#tempodata=("~/Downloads/tempo/gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv")
#system(glue::glue("cp {tempodata} {DATA}/"))
## system(glue("open {DATA}")) ## this will open the folder
Cistrome DB data
## Load the human factor QC table (tab-separated) from the data folder.
data = read_tsv(glue("{DATA}/human_factor_full_QC.txt"))
Rows: 11348 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): Species, GSMID, Factor, Cell_line, Cell_type, Tissue_type
dbl (7): DCid, FastQC, UniquelyMappedRatio, PBC, PeaksFoldChangeAbove10, FRi...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## List the column names of the loaded table.
data %>% names()
[1] "DCid" "Species" "GSMID"
[4] "Factor" "Cell_line" "Cell_type"
[7] "Tissue_type" "FastQC" "UniquelyMappedRatio"
[10] "PBC" "PeaksFoldChangeAbove10" "FRiP"
[13] "PeaksUnionDHSRatio"
## Number of distinct (Factor, Cell_line, Cell_type, Tissue_type) combinations.
data %>%
  select(Factor, Cell_line, Cell_type, Tissue_type) %>%
  unique() %>%
  dim()
[1] 4426 4
## Rows per (Factor, Cell_line, Cell_type, Tissue_type) combination,
## most frequent first.
data %>%
  count(Factor, Cell_line, Cell_type, Tissue_type) %>%
  arrange(desc(n))
# A tibble: 4,426 × 5
Factor Cell_line Cell_type Tissue_type n
<chr> <chr> <chr> <chr> <int>
1 ESR1 MCF-7 Epithelium Breast 213
2 AR LNCaP Epithelium Prostate 143
3 POLR2A HeLa Epithelium Cervix 76
4 AR VCaP Epithelium Prostate 64
5 POLR2A MCF-7 Epithelium Breast 64
6 NR3C1 A549 Epithelium Lung 46
7 POLR2A HCT-116 None HCT116 46
8 CTCF MCF-7 Epithelium Breast 45
9 FOXA1 LNCaP Epithelium Prostate 45
10 ESR1 None None Breast 42
# ℹ 4,416 more rows
## Distribution of group sizes: how many combinations occur once, twice, ...
data %>%
  count(Factor, Cell_line, Cell_type, Tissue_type) %>%
  .[["n"]] %>%
  table()
.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
2223 1196 299 282 89 106 36 43 18 28 12 17 4 8 7 6
17 18 19 20 22 23 24 25 26 28 29 30 31 32 33 34
1 8 3 2 1 1 2 3 2 1 1 3 2 2 3 3
37 38 40 42 45 46 64 76 143 213
2 1 1 1 2 2 2 1 1 1
## Are rows with Cell_line == "None" non-tumor samples? Start by counting them.
data %>%
  filter(Cell_line == "None") %>%
  dim()
[1] 1817 13
## How many unique cell lines? (first element of dim() is the number of values)
data %>%
  count(Cell_line) %>%
  dim()
[1] 520 2
## How many unique cell types? (first element of dim() is the number of values)
data %>%
  count(Cell_type) %>%
  dim()
[1] 153 2
## How many unique tissue types? (first element of dim() is the number of values)
data %>%
  count(Tissue_type) %>%
  dim()
[1] 84 2
Reuse
© HakyImLab and Listed Authors - CC BY 4.0 for Text and figures - MIT for code