SMART-total cfRNA-seq Data Analysis

1.1 Trim adaptor

cutadapt --pair-filter any  -q 30,30 \
            -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \
            --trim-n -m 16  -o >(gzip -c > {output.fastq1}) -p >(gzip -c > {output.fastq2}) \
            {input.fastq1} {input.fastq2} > {log} 2>&1

1.2 Trim GC oligo introduced in template switching

2.1 Reads alignment

2.2 Remove duplication in circRNA.bam and genome.bam with picard tools

java -jar {picardDir}/picard.jar \
            MarkDuplicates REMOVE_DUPLICATES=true \
            ASSUME_SORT_ORDER=queryname \
            I={input.bam} \
            O={output.bam} \
            M={output.metrics} \
            READ_NAME_REGEX=null

2.3 Get intron spanning reads from genome aligned reads

3.1 Assign reads in genome bam file to certain genomic regions

3.2 Quantify gene and circRNA expression

# Quantify gene expression
# For forward stranded libraries,strandness=1, for reverse stranded libraries, strandness=2 
featureCounts -O -t exon -g gene_id -M -s {strandness} -p -a {gtf} -o {counts} {bam} > {log}
# Quantify circRNA expression
# Strandness: forward, reverse
bin/count_reads.py count_circrna -s {strandness} --paired-end -i {inbam} -o {output}

4.1 Calculate PSI scores

## Running rMATs (rMATS.4.0.2/rMATS-turbo-Linux-UCS4)
python2 {rmats_path} --b1 {pos_bam_paths} --b2 {neg_bam_paths} --gtf {gtfFile} --od {outdir} -t paired --readLength 150
## Summarize rMATs results for {splicing_type} (one of MXE A3SS A5SS SE RI)
python3 bin/summarize-splicing.py --input {splicing_type}.MATS.JC.txt  --outdir outdir --type {splicing_type} --method JC  --pos pos_ids.txt  --neg  neg_ids.txt

4.2 Calculate PDUI scores

## Sort bam file by coordinate
samtools sort  -o {bamUnsorted} {bamSorted}
samtools index  {bamSorted}

## Calculate coverage, strand in +,-
bedtools genomecov -strand {strand}  -split -ibam {bamSorted} \
            | LC_COLLATE=C sort -T {temp_dir} -k1,1 -k2,2n > {bedgraph}
bedGraphToBigWig {bedgraph} {chrom_sizes} {bigwig}

## Merge Coverage of + strand and - strand
bigWigMerge {bigwigPosStrand} {bigwigNegStrand} {bedgraphMerged}
sort -k1,1 -k2,2n {bedgraphMerged} > {bedgraphMergedSorted}
bedGraphToBigWig {bedgraphMergedSorted}  {chrom_sizes} {bigwigMerged}
bigWigToWig {bigwigMerged} {wigMerged}

## Run DaPar
{DaParsDir}/src/DaPars_main.py {config}

## Summarize results
python  bin/parseDapar.py -i {input} -c {config} -l {long} -s {short} -p {PDUI}

4.3 Calcualte RNA editing level by gene


## Identify recurrent RNA editing sites
## For each sample, run
{RNAEditorDir}/RNAEditor.py  -i  {fastq_1} {fastq_2} -c {config}

# Calculate detection recurrency for each editing sites:  bin/recurrent-editing.py

# Filter intergenic editing sites: bin/get-intragene-editing-sites.sh

## Calculate coverage for each sample at recurrently edited sites, include samples which no editing events were reported by RNAEditor
samtools mpileup -l  editing-sites-pos-list.txt -o ${output} --reference ${ref} ${bam}

# Parse pileup result (get reads number support editing/support not editing)
python bin/cal-coverage.py -i {pileup} -o {coverage}

# Summarize the coverage by genomic positions: bin/summarize-editing-coverage.py

# Calculate editing level for each gene and each editng site: bin/editing-level.py 

4.4 Classification of unmapped reads

## Classify unmapped reads with kraken2
kraken2 --db {kraken2db}  --unclassified-out {outprefix}  --report {report} --paired --use-names {unmapped_1} {unmapped_2}

## Summarize kraken2 result
## Genus level data was used for classification 
python bin/summarize-kraken.py -i {report} -l {taxoLevel} -o {output}

5.1 Filtering

5.2 Differential analysis: