scATAC-seq data preprocess pipline

This pipeline is developed for scRNA-seq data pre-processing of IReNA. This pipeline analyze Cardiomyocyte data from GSE142365, and finally generates a peak-to-gene links file (in step 2) and a footprints file (in step 3) which is input of IReNA. Test data of this preprocess pipeline can be download from https://www.ncbi.nlm.nih.gov/sra?term=SRP238453

Step0: fastq-dump & cellranger

fastq-dump --split-files --gzip SRR*
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_1.fastq.gz P1D3MI_S1_L00"$i"_I1_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_2.fastq.gz P1D3MI_S1_L00"$i"_R1_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_3.fastq.gz P1D3MI_S1_L00"$i"_R2_001.fastq.gz ; done;
for (( i = 4; i < 8; i++)); do mv  SRR1074108"$i".1_4.fastq.gz P1D3MI_S1_L00"$i"_R3_001.fastq.gz ; done;
cellranger-atac count --id=CM --reference=/data2/snhuang/reference/scATAC/refdata-cellranger-arc-mm10-2020-A-2.0.0 --fastqs=/data/jyjiang/Cardiomyocyte/data/scATAC/fastq --localcores=25 --localmem=64

Step1: Preprocess scATAC-seq data(dimension reduction, clustering, subset)

In this part, we preprocess the scATAC-seq data, and subset 755 Cardiomyocyte from 2564 cells.

library(ArchR)
library(IReNA)
set.seed(1)
addArchRThreads(threads = 30) 
addArchRGenome("mm10")

setwd('/data/jyjiang/Cardiomyocyte/ATAC_archR')
ArrowFiles <- createArrowFiles(
  inputFiles = '//data/jyjiang/Cardiomyocyte/data/scATAC/P1D3/CM/outs/fragments.tsv.gz',
  sampleNames = 'CM',
  minTSS = 4, #Dont set this too high because you can always increase later
  minFrags = 1000, 
  addTileMat = TRUE,
  addGeneScoreMat = TRUE,
  force=T
)
proj <- ArchRProject(
  ArrowFiles = ArrowFiles, 
  outputDirectory = "/data/jyjiang/Cardiomyocyte/ATAC_archR",
  copyArrows = TRUE #This is recommened so that you maintain an unaltered copy for later usage.
)
proj <- addIterativeLSI(ArchRProj = proj, useMatrix = "TileMatrix", name = "IterativeLSI")
proj <- addClusters(input = proj, reducedDims = "IterativeLSI",resolution = 0.4,force = T)
proj <- addUMAP(ArchRProj = proj, reducedDims = "IterativeLSI")
ATAC_cluster <- proj@cellColData@listData$Clusters
names(ATAC_cluster) <- getCellNames(proj)
CM <- ATAC_cluster[ATAC_cluster %in% c('C5','C6')]
CMBarcode <- names(CM)
Proj_CM <- subsetArchRProject(proj,cells = CMBarcode,
                                        outputDirectory = '/data/jyjiang/Cardiomyocyte/ATAC_archR_CM'
                                        ,force = T)

Step2: Identify peak-to-gene links

We used the ArchR package to identify significant peak-to-gene links. First, we integrated the scRNA-seq and scATAC-seq datasets using unconstrained Integration method with the function ‘addGeneIntegrationMatrix’. Then, using the function ‘addPeak2GeneLinks’, we calculated the correlation between accessibility peak intensity and gene expression.

### preprocess peaks
peak <- read.delim("/data/jyjiang/Cardiomyocyte/data/scATAC/GSE142365_RAW/GSM4226355_P1D3MI_peaks.bed.gz", header=FALSE)
merged_peaks <- generate_gtf(peak)
merged_peaks2 <- merged_peaks[,c(1,4,5)]
rownames(merged_peaks2) <- merged_peaks[,10]
peak_GR <- GRanges(paste0(merged_peaks2[,1],':',merged_peaks2[,2],'-',merged_peaks2[,3]))
saveRDS(peak_GR,'/data/jyjiang/Cardiomyocyte/All_peak_GR.rds')
### add peak
peak_GR <- readRDS('/data/jyjiang/Cardiomyocyte/All_peak_GR.rds')
Proj_CM = addPeakSet(ArchRProj = Proj_CM, 
                               peakSet = peak_GR, force = T)
Proj_CM = addPeakMatrix(
  ArchRProj = Proj_CM,
  ceiling = 4,
  binarize = TRUE,
  verbose = TRUE,
  threads = 10,
  parallelParam = NULL,
  force = TRUE
)
seurat_obj <- readRDS('/data/jyjiang/Cardiomyocyte/data/scRNA/P1_CM_3sample_seurat.rds')
Proj_CM <- addGeneIntegrationMatrix(
  ArchRProj = Proj_CM, 
  useMatrix = "GeneScoreMatrix",
  matrixName = "GeneIntegrationMatrix",
  reducedDims = "IterativeLSI",
  seRNA = seurat_obj,
  addToArrow = TRUE,
  force=TRUE,
  groupRNA = "orig.ident",
  nameCell = "predictedCell_Un",
  nameGroup = "predictedGroup_Un",
  nameScore = "predictedScore_Un"
)
Proj_CM <- Process_project(Proj_CM)
Proj_CM <- addHarmony(
  ArchRProj = Proj_CM,
  reducedDims = "IterativeLSI",
  name = "Harmony",
  groupBy = "Sample"
)
Proj_CM <- addPeak2GeneLinks(
  ArchRProj = Proj_CM,
  reducedDims = "Harmony",
  useMatrix = "GeneIntegrationMatrix",
  dimsToUse = 2:30,
  knnIteration = 1500
  #### ######
)
CM_p2g = Get_p2g_fun(Proj_CM)
saveRDS(CM_p2g,'/data/jyjiang/Cardiomyocyte/CM_0H_48H_p2g.rds')

Step3 Call footprints by HINT

Output candidate peaks for calling footprints

pg <- data.frame(CM_p2g@listData[["peak"]],CM_p2g@listData[["gene"]])
p2g_peak <- pg$p2g.listData...peak...[!duplicated(pg$p2g.listData...peak...)]
p2g_peak_chr <- t(data.frame(strsplit(p2g_peak,':')))
p2g_peak_region <- t(data.frame(strsplit(p2g_peak_chr[,2],'-')))
p2g_peak_chr_region <- cbind(p2g_peak_chr[,1],p2g_peak_region)
write.table(p2g_peak_chr_region,'/home/jiangjunyao1/p2g_filtered_peaks.txt',quote = F,row.names = F,sep = '\t',col.names = F)

Call footprints by HINT

shell_code <- paste0('rgt-hint footprinting --atac-seq --paired-end --organism=mm10 --output-prefix=CM data/scATAC/P1D3/CM/outs/possorted_bam.bam p2g_filtered_peaks.txt')
system(shell_code,wait=TRUE)

Use tag-count score > 80th percentile as threshold to identify footprints with high quality

### R code
footprints <- read.table('footprints.bed',sep='\t',header = T)
footprints_80th <- footprints[footprints$V5 > quantile(footprints$V5, 0.8),]
footprints_80th <- footprints_80th[,c(1,2,3,5)]
write.table(footprints_80th, 'filtered_footprints.bed', quote=F, sep = '\t', row.names = F, col.names = F)

scRNA-seq preprocessing pipeline (used for IReNA)

Jiang junyao
Guangzhou Institutes of Biomedicine and Health, Chinese Academy of Sciences

scATAC-seq data preprocess pipline

Step0: fastq-dump & cellranger

Step1: Preprocess scATAC-seq data(dimension reduction, clustering, subset)

Step2: Identify peak-to-gene links

Step3 Call footprints by HINT

scRNA-seq preprocessing pipeline (used for IReNA)

Jiang junyao Guangzhou Institutes of Biomedicine and Health, Chinese Academy of Sciences

scATAC-seq data preprocess pipline

Step0: fastq-dump & cellranger

Step1: Preprocess scATAC-seq data(dimension reduction, clustering, subset)

Step2: Identify peak-to-gene links

Step3 Call footprints by HINT

Jiang junyao
Guangzhou Institutes of Biomedicine and Health, Chinese Academy of Sciences