scATAC-seq data preprocess pipline
This pipeline is developed for scRNA-seq data pre-processing of IReNA. This pipeline analyze Cardiomyocyte data from GSE142365, and finally generates a peak-to-gene links file (in step 2) and a footprints file (in step 3) which is input of IReNA. Test data of this preprocess pipeline can be download from https://www.ncbi.nlm.nih.gov/sra?term=SRP238453
Step0: fastq-dump & cellranger
fastq-dump --split-files --gzip SRR*
for (( i = 4; i < 8; i++)); do mv SRR1074108"$i".1_1.fastq.gz P1D3MI_S1_L00"$i"_I1_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv SRR1074108"$i".1_2.fastq.gz P1D3MI_S1_L00"$i"_R1_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv SRR1074108"$i".1_3.fastq.gz P1D3MI_S1_L00"$i"_R2_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv SRR1074108"$i".1_4.fastq.gz P1D3MI_S1_L00"$i"_R3_001.fastq.gz ; done;
cellranger-atac count --id=CM --reference=/data2/snhuang/reference/scATAC/refdata-cellranger-arc-mm10-2020-A-2.0.0 --fastqs=/data/jyjiang/Cardiomyocyte/data/scATAC/fastq --localcores=25 --localmem=64
Step1: Preprocess scATAC-seq data(dimension reduction, clustering, subset)
In this part, we preprocess the scATAC-seq data, and subset 755 Cardiomyocyte from 2564 cells.
library(ArchR)
library(IReNA)
set.seed(1)
addArchRThreads(threads = 30)
addArchRGenome("mm10")
setwd('/data/jyjiang/Cardiomyocyte/ATAC_archR')
<- createArrowFiles(
ArrowFiles inputFiles = '//data/jyjiang/Cardiomyocyte/data/scATAC/P1D3/CM/outs/fragments.tsv.gz',
sampleNames = 'CM',
minTSS = 4, #Dont set this too high because you can always increase later
minFrags = 1000,
addTileMat = TRUE,
addGeneScoreMat = TRUE,
force=T
)<- ArchRProject(
proj ArrowFiles = ArrowFiles,
outputDirectory = "/data/jyjiang/Cardiomyocyte/ATAC_archR",
copyArrows = TRUE #This is recommened so that you maintain an unaltered copy for later usage.
)<- addIterativeLSI(ArchRProj = proj, useMatrix = "TileMatrix", name = "IterativeLSI")
proj <- addClusters(input = proj, reducedDims = "IterativeLSI",resolution = 0.4,force = T)
proj <- addUMAP(ArchRProj = proj, reducedDims = "IterativeLSI")
proj <- proj@cellColData@listData$Clusters
ATAC_cluster names(ATAC_cluster) <- getCellNames(proj)
<- ATAC_cluster[ATAC_cluster %in% c('C5','C6')]
CM <- names(CM)
CMBarcode <- subsetArchRProject(proj,cells = CMBarcode,
Proj_CM outputDirectory = '/data/jyjiang/Cardiomyocyte/ATAC_archR_CM'
force = T) ,
Step2: Identify peak-to-gene links
We used the ArchR package to identify significant peak-to-gene links. First, we integrated the scRNA-seq and scATAC-seq datasets using unconstrained Integration method with the function ‘addGeneIntegrationMatrix’. Then, using the function ‘addPeak2GeneLinks’, we calculated the correlation between accessibility peak intensity and gene expression.
### preprocess peaks
<- read.delim("/data/jyjiang/Cardiomyocyte/data/scATAC/GSE142365_RAW/GSM4226355_P1D3MI_peaks.bed.gz", header=FALSE)
peak <- generate_gtf(peak)
merged_peaks <- merged_peaks[,c(1,4,5)]
merged_peaks2 rownames(merged_peaks2) <- merged_peaks[,10]
<- GRanges(paste0(merged_peaks2[,1],':',merged_peaks2[,2],'-',merged_peaks2[,3]))
peak_GR saveRDS(peak_GR,'/data/jyjiang/Cardiomyocyte/All_peak_GR.rds')
### add peak
<- readRDS('/data/jyjiang/Cardiomyocyte/All_peak_GR.rds')
peak_GR = addPeakSet(ArchRProj = Proj_CM,
Proj_CM peakSet = peak_GR, force = T)
= addPeakMatrix(
Proj_CM ArchRProj = Proj_CM,
ceiling = 4,
binarize = TRUE,
verbose = TRUE,
threads = 10,
parallelParam = NULL,
force = TRUE
)<- readRDS('/data/jyjiang/Cardiomyocyte/data/scRNA/P1_CM_3sample_seurat.rds')
seurat_obj <- addGeneIntegrationMatrix(
Proj_CM ArchRProj = Proj_CM,
useMatrix = "GeneScoreMatrix",
matrixName = "GeneIntegrationMatrix",
reducedDims = "IterativeLSI",
seRNA = seurat_obj,
addToArrow = TRUE,
force=TRUE,
groupRNA = "orig.ident",
nameCell = "predictedCell_Un",
nameGroup = "predictedGroup_Un",
nameScore = "predictedScore_Un"
)<- Process_project(Proj_CM)
Proj_CM <- addHarmony(
Proj_CM ArchRProj = Proj_CM,
reducedDims = "IterativeLSI",
name = "Harmony",
groupBy = "Sample"
)<- addPeak2GeneLinks(
Proj_CM ArchRProj = Proj_CM,
reducedDims = "Harmony",
useMatrix = "GeneIntegrationMatrix",
dimsToUse = 2:30,
knnIteration = 1500
#### ######
)= Get_p2g_fun(Proj_CM)
CM_p2g saveRDS(CM_p2g,'/data/jyjiang/Cardiomyocyte/CM_0H_48H_p2g.rds')
Step3 Call footprints by HINT
Output candidate peaks for calling footprints
<- data.frame(CM_p2g@listData[["peak"]],CM_p2g@listData[["gene"]])
pg <- pg$p2g.listData...peak...[!duplicated(pg$p2g.listData...peak...)]
p2g_peak <- t(data.frame(strsplit(p2g_peak,':')))
p2g_peak_chr <- t(data.frame(strsplit(p2g_peak_chr[,2],'-')))
p2g_peak_region <- cbind(p2g_peak_chr[,1],p2g_peak_region)
p2g_peak_chr_region write.table(p2g_peak_chr_region,'/home/jiangjunyao1/p2g_filtered_peaks.txt',quote = F,row.names = F,sep = '\t',col.names = F)
Call footprints by HINT
shell_code <- paste0('rgt-hint footprinting --atac-seq --paired-end --organism=mm10 --output-prefix=CM data/scATAC/P1D3/CM/outs/possorted_bam.bam p2g_filtered_peaks.txt')
system(shell_code,wait=TRUE)
Use tag-count score > 80th percentile as threshold to identify footprints with high quality
### R code
<- read.table('footprints.bed',sep='\t',header = T)
footprints <- footprints[footprints$V5 > quantile(footprints$V5, 0.8),]
footprints_80th <- footprints_80th[,c(1,2,3,5)]
footprints_80th write.table(footprints_80th, 'filtered_footprints.bed', quote=F, sep = '\t', row.names = F, col.names = F)