整理成输入格式 input.txt
SRR1228245_1.fastq.gz,SRR1228245_2.fastq.gz,SRR1228245_1.fq,SRR1228245_2.fq,SRR1228245_1.second.clean.fq.gz,SRR1228245_2.second.clean.fq.gzSRR1228246_1.fastq.gz,SRR1228246_2.fastq.gz,SRR1228246_1.fq,SRR1228246_2.fq,SRR1228246_1.second.clean.fq.gz,SRR1228246_2.second.clean.fq.gzSRR1228247_1.fastq.gz,SRR1228247_2.fastq.gz,SRR1228247_1.fq,SRR1228247_2.fq,SRR1228247_1.second.clean.fq.gz,SRR1228247_2.second.clean.fq.gzSRR1228252_1.fastq.gz,SRR1228252_2.fastq.gz,SRR1228252_1.fq,SRR1228252_2.fq,SRR1228252_1.second.clean.fq.gz,SRR1228252_2.second.clean.fq.gzSRR1228253_1.fastq.gz,SRR1228253_2.fastq.gz,SRR1228253_1.fq,SRR1228253_2.fq,SRR1228253_1.second.clean.fq.gz,SRR1228253_2.second.clean.fq.gzSRR1228261_1.fastq.gz,SRR1228261_2.fastq.gz,SRR1228261_1.fq,SRR1228261_2.fq,SRR1228261_1.second.clean.fq.gz,SRR1228261_2.second.clean.fq.gzSRR1228262_1.fastq.gz,SRR1228262_2.fastq.gz,SRR1228262_1.fq,SRR1228262_2.fq,SRR1228262_1.second.clean.fq.gz,SRR1228262_2.second.clean.fq.gzSRR1228263_1.fastq.gz,SRR1228263_2.fastq.gz,SRR1228263_1.fq,SRR1228263_2.fq,SRR1228263_1.second.clean.fq.gz,SRR1228263_2.second.clean.fq.gzSRR1228265_1.fastq.gz,SRR1228265_2.fastq.gz,SRR1228265_1.fq,SRR1228265_2.fq,SRR1228265_1.second.clean.fq.gz,SRR1228265_2.second.clean.fq.gzSRR1228250_1.fastq.gz,SRR1228250_2.fastq.gz,SRR1228250_1.fq,SRR1228250_2.fq,SRR1228250_1.second.clean.fq.gz,SRR1228250_2.second.clean.fq.gzSRR1228248_1.fastq.gz,SRR1228248_2.fastq.gz,SRR1228248_1.fq,SRR1228248_2.fq,SRR1228248_1.second.clean.fq.gz,SRR1228248_2.second.clean.fq.gzSRR1228249_1.fastq.gz,SRR1228249_2.fastq.gz,SRR1228249_1.fq,SRR1228249_2.fq,SRR1228249_1.second.clean.fq.gz,SRR1228249_2.second.clean.fq.gzSRR1228251_1.fastq.gz,SRR1228251_2.fastq.gz,SRR1228251_1.fq,SRR1228251_2.fq,SRR1228251_1.second.clean.fq.gz,SRR1228251_2.second.clean.fq.gzSRR1228254_1.fastq.gz,SRR1228254_2.fastq.gz,SRR1228254_1.fq,SRR1228254_2.fq,SRR1228254_1.second.clean.fq.gz,SRR1228254_2.second.clean.fq.gzSRR1228255_1.fastq.gz,SRR1228255_2.fastq.gz,SRR1228255_1.fq,SRR1228255_2.fq,SRR1228255_1.second.clean.fq.gz,SRR1228255_2.second.clean.fq.gzSRR1228256_1.fastq.gz,SRR1228256_2.fastq.gz,SRR1228256_1.fq,SRR1228256_2.fq,SRR1228256_1.second.clean.fq.gz,SRR1228256_2.second.clean.fq.gzSRR1228257_1.fastq.gz,SRR1228257_2.fastq.gz,SRR1228257_1.fq,SRR1228257_2.fq,SRR1228257_1.second.clean.fq.gz,SRR1228257_2.second.clean.fq.gzSRR1228258_1.fastq.gz,SRR1228258_2.fastq.gz,SRR1228258_1.fq,SRR1228258_2.fq,SRR1228258_1.second.clean.fq.gz,SRR1228258_2.second.clean.fq.gzSRR1228259_1.fastq.gz,SRR1228259_2.fastq.gz,SRR1228259_1.fq,SRR1228259_2.fq,SRR1228259_1.second.clean.fq.gz,SRR1228259_2.second.clean.fq.gzSRR1228260_1.fastq.gz,SRR1228260_2.fastq.gz,SRR1228260_1.fq,SRR1228260_2.fq,SRR1228260_1.second.clean.fq.gz,SRR1228260_2.second.clean.fq.gzSRR1228264_1.fastq.gz,SRR1228264_2.fastq.gz,SRR1228264_1.fq,SRR1228264_2.fq,SRR1228264_1.second.clean.fq.gz,SRR1228264_2.second.clean.fq.gz获得bam文件之后,使用featureCounts统计原始reads数,输入的文件是gtf格式的基因组注释文件。这个基因组的gtf文件生成过程是:使用gmap软件将基因mapping到基因组上,生成gff3格式的文件;将gff3格式的转换成gtf文件即可。转换的脚本见下一篇博文 gff3 to gtf.
featureCounts -T 20 -t exon -g transcript_id --readExtension5 70 --readExtension3 70 -p -O --donotsort -C -a /data2/masw_data/transcript/TGACv1.cdna.reformat.gtf -o TGACv1.rust.unique_count.txt SRR1228254.less_than_2_mismatch.bam SRR1228255.less_than_2_mismatch.bam SRR1228256.less_than_2_mismatch.bam SRR1228245.less_than_2_mismatch.bam SRR1228246.less_than_2_mismatch.bam SRR1228247.less_than_2_mismatch.bam SRR1228248.less_than_2_mismatch.bam SRR1228249.less_than_2_mismatch.bam SRR1228250.less_than_2_mismatch.bam SRR1228251.less_than_2_mismatch.bam SRR1228252.less_than_2_mismatch.bam SRR1228253.less_than_2_mismatch.bam SRR1228257.less_than_2_mismatch.bam SRR1228258.less_than_2_mismatch.bam SRR1228259.less_than_2_mismatch.bam SRR1228260.less_than_2_mismatch.bam SRR1228261.less_than_2_mismatch.bam SRR1228262.less_than_2_mismatch.bam SRR1228263.less_than_2_mismatch.bam SRR1228264.less_than_2_mismatch.bam SRR1228265.less_than_2_mismatch.bam有了原始的count,结合每个下面的脚本就可以算出每个基因的FPKM值。这个脚本已经将3个重复的平均值算出。以及重复间的标准差。得到每个sample的FPKM值,可以算出基因是否差异表达。计算p-value没有包括在这个脚本中。因为我只是想要看某一个基因的表达情况,所以调出这个基因之后,可以使用Excel简单的算下是否差异表达。
#!/usr/bin/env python# -*- coding: utf-8 -*-__author__ = 'shengwei ma'__author_email__ = 'shengweima@icloud.com'import numpy as npall_fpkm = open('rust_all_sample_fpkm.txt', 'w')all_fpkm.writelines("%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s" / "/t%s/t%s/t%s" % ('Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length', 'new_Bgt_0dpi_repeatI', 'new_Bgt_0dpi_repeatII', 'new_Bgt_0dpi_repeatIII', 'new_Pst_24dpi_repeatI', 'new_Pst_24dpi_repeatII', 'new_Pst_24dpi_repeatIII', 'new_Pst_48dpi_repeatI', 'new_Pst_48dpi_repeatII', 'new_Pst_48dpi_repeatIII', 'new_Pst_72dpi_repeatI', 'new_Pst_72dpi_repeatII', 'new_Pst_72dpi_repeatIII', 'new_Bgt_24dpi_repeatI', 'new_Bgt_24dpi_repeatII', 'new_Bgt_24dpi_repeatIII', 'new_Bgt_48dpi_repeatI', 'new_Bgt_48dpi_repeatII', 'new_Bgt_48dpi_repeatIII', 'new_Bgt_72dpi_repeatI', 'new_Bgt_72dpi_repeatII', 'new_Bgt_72dpi_repeatIII'))raw_total = [('Bgt_0dpi_repeatI', 49168553), ('Bgt_0dpi_repeatII', 44047402), ('Bgt_0dpi_repeatIII', 78098556), ('Pst_24dpi_repeatI', 38474362), ('Pst_24dpi_repeatII', 79981030), ('Pst_24dpi_repeatIII', 41041508), ('Pst_48dpi_repeatI', 46935246), ('Pst_48dpi_repeatII', 38803969), ('Pst_48dpi_repeatIII', 51627704), ('Pst_72dpi_repeatI', 37219517), ('Pst_72dpi_repeatII', 39849949), ('Pst_72dpi_repeatIII', 40299574), ('Bgt_24dpi_repeatI', 38168988), ('Bgt_24dpi_repeatII', 43073693), ('Bgt_24dpi_repeatIII', 44071613), ('Bgt_48dpi_repeatI', 40380776), ('Bgt_48dpi_repeatII', 32810256), ('Bgt_48dpi_repeatIII', 35749803), ('Bgt_72dpi_repeatI', 46203474), ('Bgt_72dpi_repeatII', 43612313), ('Bgt_72dpi_repeatIII', 40406588), ] #数字表示每个sample的总mapping reads数目with open('MLJ_unique_expression.txt', 'r') as f: print "%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s" / "/t%s/t%s/t%s" % / ('Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length', 'Bgt_0dpi', 'Pst_24dpi', 'Pst_48dpi', 'Pst_72dpi', 'Bgt_24dpi', 'Bgt_48dpi', 'Bgt_72dpi', 'Bgt_0dpi_std', 'Pst_24dpi_std', 'Pst_48dpi_std', 'Pst_72dpi_std', 'Bgt_24dpi_std', 'Bgt_48dpi_std', 'Bgt_72dpi_std') for line in f: if line.startswith('#') or line.startswith('Geneid'): pass else: new = line.strip().split('/t') (Geneid, Chr, Start, End, Strand, Length, Bgt_0dpi_repeatI, Bgt_0dpi_repeatII, Bgt_0dpi_repeatIII, Pst_24dpi_repeatI, Pst_24dpi_repeatII, Pst_24dpi_repeatIII, Pst_48dpi_repeatI, Pst_48dpi_repeatII, Pst_48dpi_repeatIII, Pst_72dpi_repeatI, Pst_72dpi_repeatII, Pst_72dpi_repeatIII, Bgt_24dpi_repeatI, Bgt_24dpi_repeatII, Bgt_24dpi_repeatIII, Bgt_48dpi_repeatI, Bgt_48dpi_repeatII, Bgt_48dpi_repeatIII, Bgt_72dpi_repeatI, Bgt_72dpi_repeatII, Bgt_72dpi_repeatIII) = new new_Bgt_0dpi_repeatI = int(Bgt_0dpi_repeatI) * pow(10.0, 9) / (int(Length) * int(raw_total[0][-1])) new_Bgt_0dpi_repeatII = int(Bgt_0dpi_repeatII) * pow(10.0, 9) / (int(Length) * int(raw_total[1][-1])) new_Bgt_0dpi_repeatIII = int(Bgt_0dpi_repeatIII) * pow(10.0, 9) / (int(Length) * int(raw_total[2][-1])) new_Pst_24dpi_repeatI = int(Pst_24dpi_repeatI) * pow(10.0, 9) / (int(Length) * int(raw_total[3][-1])) new_Pst_24dpi_repeatII = int(Pst_24dpi_repeatII) * pow(10.0, 9) / (int(Length) * int(raw_total[4][-1])) new_Pst_24dpi_repeatIII = int(Pst_24dpi_repeatIII) * pow(10.0, 9) / (int(Length) * int(raw_total[5][-1])) new_Pst_48dpi_repeatI = int(Pst_48dpi_repeatI) * pow(10.0, 9) / (int(Length) * int(raw_total[6][-1])) new_Pst_48dpi_repeatII = int(Pst_48dpi_repeatII) * pow(10.0, 9) / (int(Length) * int(raw_total[7][-1])) new_Pst_48dpi_repeatIII = int(Pst_48dpi_repeatIII) * pow(10.0, 9) / (int(Length) * int(raw_total[8][-1])) new_Pst_72dpi_repeatI = int(Pst_72dpi_repeatI) * pow(10.0, 9) / (int(Length) * int(raw_total[9][-1])) new_Pst_72dpi_repeatII = int(Pst_72dpi_repeatII) * pow(10.0, 9) / (int(Length) * int(raw_total[10][-1])) new_Pst_72dpi_repeatIII = int(Pst_72dpi_repeatIII) * pow(10.0, 9) / (int(Length) * int(raw_total[11][-1])) new_Bgt_24dpi_repeatI = int(Bgt_24dpi_repeatI) * pow(10.0, 9) / (int(Length) * int(raw_total[12][-1])) new_Bgt_24dpi_repeatII = int(Bgt_24dpi_repeatII) * pow(10.0, 9) / (int(Length) * int(raw_total[13][-1])) new_Bgt_24dpi_repeatIII = int(Bgt_24dpi_repeatIII) * pow(10.0, 9) / (int(Length) * int(raw_total[14][-1])) new_Bgt_48dpi_repeatI = int(Bgt_48dpi_repeatI) * pow(10.0, 9) / (int(Length) * int(raw_total[15][-1])) new_Bgt_48dpi_repeatII = int(Bgt_48dpi_repeatII) * pow(10.0, 9) / (int(Length) * int(raw_total[16][-1])) new_Bgt_48dpi_repeatIII = int(Bgt_48dpi_repeatIII) * pow(10.0 , 6) / (int(Length) * int(raw_total[17][-1])) new_Bgt_72dpi_repeatI = int(Bgt_72dpi_repeatI) * pow(10.0, 9) / (int(Length) * int(raw_total[18][-1])) new_Bgt_72dpi_repeatII = int(Bgt_72dpi_repeatII) * pow(10.0, 9) / (int(Length) * int(raw_total[19][-1])) new_Bgt_72dpi_repeatIII = int(Bgt_72dpi_repeatIII) * pow(10.0, 9) / (int(Length) * int(raw_total[20][-1])) Bgt_0dpi_mean = np.mean(np.array([new_Bgt_0dpi_repeatI, new_Bgt_0dpi_repeatII, new_Bgt_0dpi_repeatIII])) Bgt_0dpi_std = np.std(np.array([new_Bgt_0dpi_repeatI, new_Bgt_0dpi_repeatII, new_Bgt_0dpi_repeatIII])) Pst_24dpi_mean = np.mean(np.array([new_Pst_24dpi_repeatI, new_Pst_24dpi_repeatII, new_Pst_24dpi_repeatIII])) Pst_24dpi_std = np.std(np.array([new_Pst_24dpi_repeatI, new_Pst_24dpi_repeatII, new_Pst_24dpi_repeatIII])) Pst_48dpi_mean = np.mean(np.array([new_Pst_48dpi_repeatI, new_Pst_48dpi_repeatII, new_Pst_48dpi_repeatIII])) Pst_48dpi_std = np.std(np.array([new_Pst_48dpi_repeatI, new_Pst_48dpi_repeatII, new_Pst_48dpi_repeatIII])) Pst_72dpi_mean = np.mean(np.array([new_Pst_72dpi_repeatI, new_Pst_72dpi_repeatII, new_Pst_72dpi_repeatIII])) Pst_72dpi_std = np.std(np.array([new_Pst_72dpi_repeatI, new_Pst_72dpi_repeatII, new_Pst_72dpi_repeatIII])) Bgt_24dpi_mean = np.mean(np.array([new_Bgt_24dpi_repeatI, new_Bgt_24dpi_repeatII, new_Bgt_24dpi_repeatIII])) Bgt_24dpi_std = np.std(np.array([new_Bgt_24dpi_repeatI, new_Bgt_24dpi_repeatII, new_Bgt_24dpi_repeatIII])) Bgt_48dpi_mean = np.mean(np.array([new_Bgt_48dpi_repeatI, new_Bgt_48dpi_repeatII, new_Bgt_48dpi_repeatIII])) Bgt_48dpi_std = np.std(np.array([new_Bgt_48dpi_repeatI, new_Bgt_48dpi_repeatII, new_Bgt_48dpi_repeatIII])) Bgt_72dpi_mean = np.mean(np.array([new_Bgt_72dpi_repeatI, new_Bgt_72dpi_repeatII, new_Bgt_72dpi_repeatIII])) Bgt_72dpi_std = np.std(np.array([new_Bgt_72dpi_repeatI, new_Bgt_72dpi_repeatII, new_Bgt_72dpi_repeatIII])) print "%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s" / "/t%s/t%s/t%s" % / (Geneid, Chr, Start, End, Strand, Length, Bgt_0dpi_mean, Pst_24dpi_mean, Pst_48dpi_mean, Pst_72dpi_mean, Bgt_24dpi_mean, Bgt_48dpi_mean, Bgt_72dpi_mean, Bgt_0dpi_std, Pst_24dpi_std, Pst_48dpi_std, Pst_72dpi_std, Bgt_24dpi_std, Bgt_48dpi_std, Bgt_72dpi_std) all_fpkm.writelines( "%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s/t%s" / "/t%s/t%s/t%s" % (Geneid, Chr, Start, End, Strand, Length, new_Bgt_0dpi_repeatI, new_Bgt_0dpi_repeatII, new_Bgt_0dpi_repeatIII, new_Pst_24dpi_repeatI, new_Pst_24dpi_repeatII, new_Pst_24dpi_repeatIII, new_Pst_48dpi_repeatI, new_Pst_48dpi_repeatII, new_Pst_48dpi_repeatIII, new_Pst_72dpi_repeatI, new_Pst_72dpi_repeatII, new_Pst_72dpi_repeatIII, new_Bgt_24dpi_repeatI, new_Bgt_24dpi_repeatII, new_Bgt_24dpi_repeatIII, new_Bgt_48dpi_repeatI, new_Bgt_48dpi_repeatII, new_Bgt_48dpi_repeatIII, new_Bgt_72dpi_repeatI, new_Bgt_72dpi_repeatII, new_Bgt_72dpi_repeatIII))all_fpkm.close()新闻热点
疑难解答