scATAC-seq data preprocess pipline

This pipeline is developed for scRNA-seq data pre-processing of IReNA. This pipeline analyze Cardiomyocyte data from GSE142365, and finally generates a peak-to-gene links file (in step 2) and a footprints file (in step 3) which is input of IReNA. Test data of this preprocess pipeline can be download from https://www.ncbi.nlm.nih.gov/sra?term=SRP238453

Step0: fastq-dump & cellranger

fastq-dump --split-files --gzip SRR*
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_1.fastq.gz P1D3MI_S1_L00"$i"_I1_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_2.fastq.gz P1D3MI_S1_L00"$i"_R1_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_3.fastq.gz P1D3MI_S1_L00"$i"_R2_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_4.fastq.gz P1D3MI_S1_L00"$i"_R3_001.fastq.gz ; done;
cellranger-atac count --id=CM --reference=/data2/snhuang/reference/scATAC/refdata-cellranger-arc-mm10-2020-A-2.0.0 --fastqs=/data/jyjiang/Cardiomyocyte/data/scATAC/fastq --localcores=25 --localmem=64

Step1: Preprocess scATAC-seq data(dimension reduction, clustering, subset)

In this part, we preprocess the scATAC-seq data, and subset 755 Cardiomyocyte from 2564 cells.

library(ArchR)
library(IReNA)
set.seed(1)
addArchRThreads(threads = 30) 
addArchRGenome("mm10")

setwd('/data/jyjiang/Cardiomyocyte/ATAC_archR')
ArrowFiles <- createArrowFiles(
  inputFiles = '//data/jyjiang/Cardiomyocyte/data/scATAC/P1D3/CM/outs/fragments.tsv.gz',
  sampleNames = 'CM',
  minTSS = 4, #Dont set this too high because you can always increase later
  minFrags = 1000, 
  addTileMat = TRUE,
  addGeneScoreMat = TRUE,
  force=T
)
proj <- ArchRProject(
  ArrowFiles = ArrowFiles, 
  outputDirectory = "/data/jyjiang/Cardiomyocyte/ATAC_archR",
  copyArrows = TRUE #This is recommened so that you maintain an unaltered copy for later usage.
)
proj <- addIterativeLSI(ArchRProj = proj, useMatrix = "TileMatrix", name = "IterativeLSI")
proj <- addClusters(input = proj, reducedDims = "IterativeLSI",resolution = 0.4,force = T)
proj <- addUMAP(ArchRProj = proj, reducedDims = "IterativeLSI")
ATAC_cluster <- proj@cellColData@listData$Clusters
names(ATAC_cluster) <- getCellNames(proj)
CM <- ATAC_cluster[ATAC_cluster %in% c('C5','C6')]
CMBarcode <- names(CM)
Proj_CM <- subsetArchRProject(proj,cells = CMBarcode,
                                        outputDirectory = '/data/jyjiang/Cardiomyocyte/ATAC_archR_CM'
                                        ,force = T)

Step3 Call footprints by HINT

Output candidate peaks for calling footprints

pg <- data.frame(CM_p2g@listData[["peak"]],CM_p2g@listData[["gene"]])
p2g_peak <- pg$p2g.listData...peak...[!duplicated(pg$p2g.listData...peak...)]
p2g_peak_chr <- t(data.frame(strsplit(p2g_peak,':')))
p2g_peak_region <- t(data.frame(strsplit(p2g_peak_chr[,2],'-')))
p2g_peak_chr_region <- cbind(p2g_peak_chr[,1],p2g_peak_region)
write.table(p2g_peak_chr_region,'/home/jiangjunyao1/p2g_filtered_peaks.txt',quote = F,row.names = F,sep = '\t',col.names = F)

Call footprints by HINT

shell_code <- paste0('rgt-hint footprinting --atac-seq --paired-end --organism=mm10 --output-prefix=CM data/scATAC/P1D3/CM/outs/possorted_bam.bam p2g_filtered_peaks.txt')
system(shell_code,wait=TRUE)

Use tag-count score > 80th percentile as threshold to identify footprints with high quality

### R code
footprints <- read.table('footprints.bed',sep='\t',header = T)
footprints_80th <- footprints[footprints$V5 > quantile(footprints$V5, 0.8),]
footprints_80th <- footprints_80th[,c(1,2,3,5)]
write.table(footprints_80th, 'filtered_footprints.bed', quote=F, sep = '\t', row.names = F, col.names = F)