# Setup: attach the packages used below and make sure the per-post data
# directory exists before anything tries to read from or write to it.
suppressMessages(library(tidyverse))
suppressMessages(library(glue))

# Root of the web-data store (absolute path on the author's machine).
PRE <- "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
SLUG <- "how-to-read-gencode-gtf-file" ## copy the slug from the header
bDATE <- "2023-04-27" ## copy the date from the blog's header here

# Data directory for this post: <PRE>/<date>-<slug>
DATA <- glue("{PRE}/{bDATE}-{SLUG}")

# Create the directory from R rather than shelling out to `mkdir`: this is
# portable, is not broken by spaces or shell metacharacters in the path, and
# `recursive = TRUE` also creates any missing parent directories.
if (!dir.exists(DATA)) {
  dir.create(DATA, recursive = TRUE)
}
WORK <- DATA

## move data to DATA
#tempodata=("~/Downloads/tempo/gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv")
#system(glue::glue("cp {tempodata} {DATA}/"))
#system(glue("open {DATA}")) ## this will open the folder
Install rtracklayer (via BiocManager) if necessary:
# Install rtracklayer from Bioconductor, but only when it is missing.
# BiocManager is the installer front end, so bootstrap it first.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Guarded so that re-running this snippet does not trigger a reinstall
# when rtracklayer is already available.
if (!requireNamespace("rtracklayer", quietly = TRUE)) {
  BiocManager::install("rtracklayer")
}
Code
# Read the GENCODE v43 GFF3 annotation and flatten it into a data frame.
library(rtracklayer)

# Location of the compressed annotation inside the post's data directory
# (`DATA` is set in the setup chunk).
gff_file_path <- glue("{DATA}/gencode.v43.annotation.gff3.gz")

# Parse the file as GFF3.
gff_data <- import(gff_file_path, format = "gff3")

# One row per annotation record.
gff_data_frame <- as.data.frame(gff_data)

# Preview the first six rows.
head(gff_data_frame, n = 6L)
Code
# Sanity checks on the annotation table `gff_data_frame` built in the
# previous chunk.

# An example gene identifier, printed for reference.
"ENSG00000164308"

## all gene_id start with ENSG
# Tabulate the first four characters of every gene_id.
table(substr(gff_data_frame$gene_id, 1, 4))

## all entries have a gene_id
# Compare the row count reported here with the total of the table above.
dim(gff_data_frame)

## number of genes with positive vs negative strands
# Count gene-level records per strand.
# NOTE(review): assumes the data frame has `type` and `strand` columns, as
# produced by as.data.frame() on the imported annotation - confirm.
table(gff_data_frame$strand[gff_data_frame$type == "gene"])
To load version 75 of Ensembl with biomaRt:
Code
# Open a BioMart connection to an archived Ensembl site.
library(biomaRt)

# Earlier attempt against the December 2013 archive, kept for reference.
# NOTE(review): both hosts are labelled "version 75" below, which cannot be
# right for both; dec2013 is presumably release 74 - confirm against the
# Ensembl archive list.
# mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
#                 host = "https://dec2013.archive.ensembl.org", # This host corresponds to Ensembl version 75
#                 path = "/biomart/martservice")

mart <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  host = "https://feb2014.archive.ensembl.org", # This host corresponds to Ensembl version 75
  path = "/biomart/martservice"
)
---
title: "How to read gencode gtf file"
author: "Haky Im"
date: "2023-04-27"
categories: [analysis,how_to]
format:
  html:
    code-fold: true
    code-summary: "Show the code"
editor_options:
  chunk_output_type: console
---

```{r}
# Setup: attach the packages used below and make sure the per-post data
# directory exists before anything tries to read from or write to it.
suppressMessages(library(tidyverse))
suppressMessages(library(glue))

# Root of the web-data store (absolute path on the author's machine).
PRE <- "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
SLUG <- "how-to-read-gencode-gtf-file" ## copy the slug from the header
bDATE <- "2023-04-27" ## copy the date from the blog's header here

# Data directory for this post: <PRE>/<date>-<slug>
DATA <- glue("{PRE}/{bDATE}-{SLUG}")

# Create the directory from R rather than shelling out to `mkdir`: this is
# portable, is not broken by spaces or shell metacharacters in the path, and
# `recursive = TRUE` also creates any missing parent directories.
if (!dir.exists(DATA)) {
  dir.create(DATA, recursive = TRUE)
}
WORK <- DATA

## move data to DATA
#tempodata=("~/Downloads/tempo/gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv")
#system(glue::glue("cp {tempodata} {DATA}/"))
#system(glue("open {DATA}")) ## this will open the folder
```

- [ ] install if necessary

```
# Install rtracklayer from Bioconductor, but only when it is missing.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("rtracklayer", quietly = TRUE)) {
  BiocManager::install("rtracklayer")
}
```

```{r eval=FALSE}
# Load the rtracklayer package
library(rtracklayer)

# Define the path to your GFF file
gff_file_path <- glue("{DATA}/gencode.v43.annotation.gff3.gz")

# Read the GFF file
gff_data <- import(gff_file_path, format = "gff3")

# Convert the GFF data to a data frame
gff_data_frame <- as.data.frame(gff_data)

# Display the first few rows of the data frame
head(gff_data_frame)
```

```{r eval=FALSE}
"ENSG00000164308"

## all gene_id start with ENSG
table(substr(gff_data_frame$gene_id, 1, 4))

## all entries have a gene_id
dim(gff_data_frame)

## number of genes with positive vs negative strands
# NOTE(review): assumes the data frame has `type` and `strand` columns, as
# produced by as.data.frame() on the imported annotation - confirm.
table(gff_data_frame$strand[gff_data_frame$type == "gene"])
```

to load version 75 of ensembl

```{r eval=FALSE}
library(biomaRt)

# mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
#                 host = "https://dec2013.archive.ensembl.org", # This host corresponds to Ensembl version 75
#                 path = "/biomart/martservice")
mart <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  host = "https://feb2014.archive.ensembl.org", # This host corresponds to Ensembl version 75
  path = "/biomart/martservice"
)
```