Diff
checker
文本
文本
圖像
文檔
Excel
文件夾
Legal
Enterprise
桌面版
定價
登入
下載 Diffchecker 桌面版
比較文本
尋找兩個文字檔案之間的差異
工具
歷史
即時編輯器
摺疊未變更行
關閉換行
檢視
拆分
統一
比對精度
智能
單詞
字符
語法突出顯示
選擇語法
忽略
文字轉換
前往第一個差異
編輯輸入
Diffchecker Desktop
執行Diffchecker最安全的方式。取得Diffchecker桌面應用程式:您的差異永遠不會離開您的電腦!
取得桌面版
leafcutter_cluster.py VS leafcutter_cluster_regtools.py
建立於
6 年前
差異永不過期
清除
匯出
分享
解釋
36 刪除
行
總計
刪除
字符
總計
刪除
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
489 行
全部複製
94 新增
行
總計
新增
字符
總計
新增
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
534 行
全部複製
複製
已複製
複製
已複製
#!/usr/bin/env python
# https://github.com/griffithlab/regtools
# /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc
# Using regtools speeds up the junction extraction step by an order of magnitude or more
import sys
import sys
import tempfile
import tempfile
import os
import os
import gzip
import gzip
import shutil
import shutil
def main(options,libl):
def main(options,libl):
複製
已複製
複製
已複製
pool_junc_reads(libl, options)
if options.cluster == None:
refine_clusters(options)
pool_junc_reads(libl, options)
refine_clusters(options)
sort_junctions(libl, options)
sort_junctions(libl, options)
merge_junctions(options)
merge_junctions(options)
get_numers(options)
get_numers(options)
def pool_junc_reads(flist, options):
def pool_junc_reads(flist, options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
maxIntronLen = int(options.maxintronlen)
maxIntronLen = int(options.maxintronlen)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
outFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_pooled"%(rundir,outPrefix)
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
by_chrom = {}
by_chrom = {}
for libl in flist:
for libl in flist:
lib = libl.strip()
lib = libl.strip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
if options.verbose:
if options.verbose:
sys.stderr.write("scanning %s...\n"%lib)
sys.stderr.write("scanning %s...\n"%lib)
複製
已複製
複製
已複製
if lib[-3:] == ".gz": F = gzip.open(lib)
for ln in
open(lib)
:
else: F = open(lib)
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
複製
已複製
複製
已複製
chrom, A, B, dot, counts, strand
= lnsplit
chrom, A, B, dot, counts, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= lnsplit
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
複製
已複製
複製
已複製
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
複製
已複製
複製
已複製
A, B = int(A)
, int(B)
+1
Aoff, Boff = blockSize.split(",")
A, B = int(A)
+int(Aoff)
, int(B)
-int(Boff)+1
if B-A > int(maxIntronLen): continue
if B-A > int(maxIntronLen): continue
複製
已複製
複製
已複製
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
except:
except:
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
複製
已複製
複製
已複製
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
fout = file(outFile, 'w')
fout = file(outFile, 'w')
Ncluster = 0
Ncluster = 0
sys.stderr.write("Parsing...\n")
sys.stderr.write("Parsing...\n")
for chrom in by_chrom:
for chrom in by_chrom:
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks.sort()
read_ks.sort()
複製
已複製
複製
已複製
sys.stderr.write("%s:%s.."%chrom)
sys.stderr.write("%s:%s.."%chrom)
clu = cluster_intervals(read_ks)[0]
clu = cluster_intervals(read_ks)[0]
for cl in clu:
for cl in clu:
複製
已複製
複製
已複製
if len(cl) >
1
: #
if cluster has more than one intron
if len(cl) >
0
: #
1
if cluster has more than one intron
buf = '%s:%s '%chrom
buf = '%s:%s '%chrom
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
buf += "%d:%d" % interval + ":%d"%count+ " "
buf += "%d:%d" % interval + ":%d"%count+ " "
fout.write(buf+'\n')
fout.write(buf+'\n')
複製
已複製
複製
已複製
Ncluster += 1
Ncluster += 1
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
fout.close()
fout.close()
def sort_junctions(libl, options):
def sort_junctions(libl, options):
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
複製
已複製
複製
已複製
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
複製
已複製
複製
已複製
if options.cluster == None:
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
else:
refined_cluster = options.cluster
runName = "%s/%s"%(rundir, outPrefix)
runName = "%s/%s"%(rundir, outPrefix)
exons, cluExons = {}, {}
exons, cluExons = {}, {}
cluN = 0
cluN = 0
for ln in open(refined_cluster):
for ln in open(refined_cluster):
chrom = ln.split()[0]
chrom = ln.split()[0]
複製
已複製
複製
已複製
chrom = tuple(chrom.split(":"))
cluN += 1
cluN += 1
for exon in ln.split()[1:]:
for exon in ln.split()[1:]:
A, B, count = exon.split(":")
A, B, count = exon.split(":")
if chrom not in exons:
if chrom not in exons:
exons[chrom] = {}
exons[chrom] = {}
複製
已複製
複製
已複製
exons[chrom][(int(A),int(B))] = cluN
exons[chrom][(int(A),int(B))] = cluN
if cluN not in cluExons:
if cluN not in cluExons:
cluExons[cluN] = []
cluExons[cluN] = []
cluExons[cluN].append((chrom, A, B))
cluExons[cluN].append((chrom, A, B))
merges = {}
merges = {}
for ll in libl:
for ll in libl:
lib=ll.rstrip()
lib=ll.rstrip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
libN = lib
libN = lib
if libN not in merges:
if libN not in merges:
merges[libN] = []
merges[libN] = []
merges[libN].append(lib)
merges[libN].append(lib)
fout_runlibs = file(runName+"_sortedlibs",'w')
fout_runlibs = file(runName+"_sortedlibs",'w')
for libN in merges:
for libN in merges:
libName = "%s/%s"%(rundir,libN.split('/')[-1])
libName = "%s/%s"%(rundir,libN.split('/')[-1])
by_chrom = {}
by_chrom = {}
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
fout_runlibs.write(foutName+'\n')
fout_runlibs.write(foutName+'\n')
if options.verbose:
if options.verbose:
sys.stderr.write("Sorting %s..\n"%libN)
sys.stderr.write("Sorting %s..\n"%libN)
if len(merges[libN]) > 1:
if len(merges[libN]) > 1:
if options.verbose:
if options.verbose:
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
else:
else:
pass
pass
fout = gzip.open(foutName,'w')
fout = gzip.open(foutName,'w')
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
for lib in merges[libN]:
for lib in merges[libN]:
複製
已複製
複製
已複製
if lib[-3:] == ".gz": F = gzip.open(lib)
else: F = open(lib)
for ln in
open(lib)
:
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
複製
已複製
複製
已複製
chrom,
start, end
, dot, count, strand
= ln
.
split
()
chrom,
A, B
, dot, count, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= ln
split
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
複製
已複製
複製
已複製
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
複製
已複製
複製
已複製
chrom = (chrom,
strand)
Aoff, Boff = blockSize.split(",")
A, B = int(A)+int(Aoff), int(B)-int(Boff)+1
chrom = (chrom,
strand)
if chrom not in by_chrom:
if chrom not in by_chrom:
by_chrom[chrom] = {}
by_chrom[chrom] = {}
複製
已複製
複製
已複製
intron = (
int(start), int(end)+1)
intron = (
A, B)
if intron in by_chrom[chrom]:
if intron in by_chrom[chrom]:
by_chrom[chrom][intron] += int(count)
by_chrom[chrom][intron] += int(count)
else:
else:
by_chrom[chrom][intron] = int(count)
by_chrom[chrom][intron] = int(count)
複製
已複製
複製
已複製
for clu in cluExons:
for clu in cluExons:
buf = []
buf = []
ks = cluExons[clu]
ks = cluExons[clu]
ks.sort()
ks.sort()
tot = 0
tot = 0
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
複製
已複製
複製
已複製
chrom = tuple(chrom.split(":"))
start, end = int(start), int(end)
start, end = int(start), int(end)
if chrom not in by_chrom:
if chrom not in by_chrom:
pass
pass
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
tot += by_chrom[chrom][(start,end)]
tot += by_chrom[chrom][(start,end)]
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
start, end = int(start), int(end)
start, end = int(start), int(end)
複製
已複製
複製
已複製
chrom = tuple(chrom.split(":"))
chromID, strand = chrom
chromID, strand = chrom
if chrom not in by_chrom:
if chrom not in by_chrom:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
複製
已複製
複製
已複製
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
else:
else:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
fout.write("".join(buf))
fout.write("".join(buf))
fout.close()
fout.close()
fout_runlibs.close()
fout_runlibs.close()
def refine_clusters(options):
def refine_clusters(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
minratio = float(options.mincluratio)
minratio = float(options.mincluratio)
minclureads = int(options.minclureads)
minclureads = int(options.minclureads)
minreads = int(options.minreads)
minreads = int(options.minreads)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
複製
已複製
複製
已複製
fout = file(outFile,'w')
fout = file(outFile,'w')
複製
已複製
複製
已複製
Ncl = 0
Ncl = 0
for ln in open(inFile):
for ln in open(inFile):
clu = []
clu = []
totN = 0
totN = 0
chrom = ln.split()[0]
chrom = ln.split()[0]
for ex in ln.split()[1:]:
for ex in ln.split()[1:]:
A, B, N = ex.split(":")
A, B, N = ex.split(":")
clu.append(((int(A),int(B)), int(N)))
clu.append(((int(A),int(B)), int(N)))
totN += int(N)
totN += int(N)
複製
已複製
複製
已複製
if totN < minclureads: continue
if totN < minclureads: continue
複製
已複製
複製
已複製
if options.const:
if len(clu) == 1:
buf = '%s ' % chrom
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
fout.write(buf+'\n')
for cl in refine_linked(clu):
for cl in refine_linked(clu):
rc = refine_cluster(cl,minratio, minreads)
rc = refine_cluster(cl,minratio, minreads)
複製
已複製
複製
已複製
if len(rc) > 0:
if len(rc) > 0:
for clu in rc:
for clu in rc:
buf = '%s ' % chrom
buf = '%s ' % chrom
for interval, count in clu:
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
Ncl += 1
fout.write(buf+'\n')
fout.write(buf+'\n')
複製
已複製
複製
已複製
sys.stderr.write("Split into %s clusters...\n"%Ncl)
sys.stderr.write("Split into %s clusters...\n"%Ncl)
fout.close()
fout.close()
def merge_junctions(options):
def merge_junctions(options):
''' function to merge junctions '''
''' function to merge junctions '''
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
複製
已複製
複製
已複製
fnameout = "%s/%s"%(rundir,outPrefix)
fnameout = "%s/%s"%(rundir,outPrefix)
複製
已複製
複製
已複製
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
lsts = []
lsts = []
for ln in open(flist):
for ln in open(flist):
lsts.append(ln.strip())
lsts.append(ln.strip())
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
# Change 300 if max open file is < 300
# Change 300 if max open file is < 300
N = min([300, max([100, int(len(lsts)**(0.5))])])
N = min([300, max([100, int(len(lsts)**(0.5))])])
tmpfiles = []
tmpfiles = []
while len(lsts) > 1:
while len(lsts) > 1:
clst = []
clst = []
for i in range(0,(len(lsts)/N)+1):
for i in range(0,(len(lsts)/N)+1):
lst = lsts[N*i:N*(i+1)]
lst = lsts[N*i:N*(i+1)]
if len(lst) > 0:
if len(lst) > 0:
clst.append(lst)
clst.append(lst)
lsts = []
lsts = []
for lst in clst:
for lst in clst:
if len(lst) == 0: continue
if len(lst) == 0: continue
tmpfile = tempfile.mktemp()
tmpfile = tempfile.mktemp()
os.mkdir(tmpfile)
os.mkdir(tmpfile)
foutname = tmpfile+"/tmpmerge.gz"
foutname = tmpfile+"/tmpmerge.gz"
fout = gzip.open(foutname,'w')
fout = gzip.open(foutname,'w')
merge_files(lst, fout, options)
merge_files(lst, fout, options)
lsts.append(foutname)
lsts.append(foutname)
tmpfiles.append(foutname)
tmpfiles.append(foutname)
fout.close()
fout.close()
複製
已複製
複製
已複製
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
if not options.const:
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
else:
shutil.move(lsts[0], fnameout+"_perind.constcounts.gz")
def merge_files(fnames, fout, options):
def merge_files(fnames, fout, options):
fopen = []
fopen = []
for fname in fnames:
for fname in fnames:
if fname[-3:] == ".gz":
if fname[-3:] == ".gz":
fopen.append(gzip.open(fname))
fopen.append(gzip.open(fname))
else:
else:
fopen.append(open(fname))
fopen.append(open(fname))
finished = False
finished = False
N = 0
N = 0
while not finished:
while not finished:
N += 1
N += 1
if N % 50000 == 0:
if N % 50000 == 0:
sys.stderr.write(".")
sys.stderr.write(".")
buf = []
buf = []
for f in fopen:
for f in fopen:
ln = f.readline().split()
ln = f.readline().split()
if len(ln) == 0:
if len(ln) == 0:
finished = True
finished = True
break
break
chrom = ln[0]
chrom = ln[0]
data = ln[1:]
data = ln[1:]
if len(buf) == 0:
if len(buf) == 0:
buf.append(chrom)
buf.append(chrom)
buf += data
buf += data
if len(buf) > 0:
if len(buf) > 0:
if buf[0] == "chrom":
if buf[0] == "chrom":
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d files"%(len(buf)-1))
sys.stderr.write("merging %d files"%(len(buf)-1))
fout.write(" ".join(buf)+'\n')
fout.write(" ".join(buf)+'\n')
else:
else:
break
break
sys.stderr.write(" done.\n")
sys.stderr.write(" done.\n")
for fin in fopen:
for fin in fopen:
fin.close()
fin.close()
def cluster_intervals(E):
def cluster_intervals(E):
''' Clusters intervals together. '''
''' Clusters intervals together. '''
E.sort()
E.sort()
複製
已複製
複製
已複製
if len(E) == 0:
return [], []
current = E[0]
current = E[0]
Eclusters, cluster = [], []
Eclusters, cluster = [], []
i = 0
i = 0
while i < len(E):
while i < len(E):
if overlaps(E[i], current):
if overlaps(E[i], current):
cluster.append(E[i])
cluster.append(E[i])
else:
else:
Eclusters.append(cluster)
Eclusters.append(cluster)
cluster = [E[i]]
cluster = [E[i]]
current = (E[i][0], max([current[1], E[i][1]]))
current = (E[i][0], max([current[1], E[i][1]]))
i += 1
i += 1
if len(cluster) > 0:
if len(cluster) > 0:
Eclusters.append(cluster)
Eclusters.append(cluster)
return Eclusters, E
return Eclusters, E
def overlaps(A,B):
def overlaps(A,B):
'''
'''
Checks if A and B overlaps
Checks if A and B overlaps
'''
'''
if A[1] < B[0] or B[1] < A[0]:
if A[1] < B[0] or B[1] < A[0]:
return False
return False
else: return True
else: return True
def refine_linked(clusters):
def refine_linked(clusters):
unassigned = [x for x in clusters[1:]]
unassigned = [x for x in clusters[1:]]
current = [clusters[0]]
current = [clusters[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
newClusters = []
newClusters = []
while len(unassigned) > 0:
while len(unassigned) > 0:
finished = False
finished = False
while not finished:
while not finished:
finished = True
finished = True
torm = []
torm = []
for intron in unassigned:
for intron in unassigned:
inter, count = intron
inter, count = intron
start, end = inter
start, end = inter
if start in splicesites or end in splicesites:
if start in splicesites or end in splicesites:
current.append(intron)
current.append(intron)
splicesites.add(start)
splicesites.add(start)
splicesites.add(end)
splicesites.add(end)
finished = False
finished = False
torm.append(intron)
torm.append(intron)
for intron in torm:
for intron in torm:
unassigned.remove(intron)
unassigned.remove(intron)
newClusters.append(current)
newClusters.append(current)
current = []
current = []
if len(unassigned) > 0:
if len(unassigned) > 0:
current = [unassigned[0]]
current = [unassigned[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
unassigned = unassigned[1:]
unassigned = unassigned[1:]
return newClusters
return newClusters
def refine_cluster(clu, cutoff, readcutoff):
def refine_cluster(clu, cutoff, readcutoff):
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
remove and recluster '''
remove and recluster '''
remove = []
remove = []
dic = {}
dic = {}
intervals = []
intervals = []
reCLU = False
reCLU = False
totN = 0
totN = 0
for inter, count in clu:
for inter, count in clu:
totN += count
totN += count
for inter, count in clu:
for inter, count in clu:
if (count/float(totN) >= cutoff and count >= readcutoff):
if (count/float(totN) >= cutoff and count >= readcutoff):
intervals.append(inter)
intervals.append(inter)
dic[inter] = count
dic[inter] = count
else:
else:
reCLU = True
reCLU = True
if len(intervals) == 0: return []
if len(intervals) == 0: return []
# This makes sure that after trimming, the clusters are still good
# This makes sure that after trimming, the clusters are still good
Atmp, B = cluster_intervals(intervals)
Atmp, B = cluster_intervals(intervals)
A = []
A = []
for cl in Atmp:
for cl in Atmp:
for c in refine_linked([(x,0) for x in cl]):
for c in refine_linked([(x,0) for x in cl]):
if len(c) > 0:
if len(c) > 0:
A.append([x[0] for x in c])
A.append([x[0] for x in c])
if len(A) == 1:
if len(A) == 1:
rc = [(x, dic[x]) for x in A[0]]
rc = [(x, dic[x]) for x in A[0]]
if len(rc) > 1:
if len(rc) > 1:
if reCLU:
if reCLU:
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
else:
else:
return [[(x, dic[x]) for x in A[0]]]
return [[(x, dic[x]) for x in A[0]]]
else:
else:
return []
return []
NCs = []
NCs = []
for c in A:
for c in A:
if len(c) > 1:
if len(c) > 1:
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NCs += NC
NCs += NC
複製
已複製
複製
已複製
return NCs
return NCs
def get_numers(options):
def get_numers(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
複製
已複製
複製
已複製
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
if not options.const:
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
else:
fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix)
input_file=gzip.open(fname, 'rb')
input_file=gzip.open(fname, 'rb')
fout = gzip.open(fnameout,'w')
fout = gzip.open(fnameout,'w')
first_line=True
first_line=True
for l in input_file:
for l in input_file:
if first_line:
if first_line:
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
first_line=False
first_line=False
else:
else:
l=l.strip()
l=l.strip()
words=l.split(" ")
words=l.split(" ")
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
input_file.close()
input_file.close()
fout.close()
fout.close()
if __name__ == "__main__":
if __name__ == "__main__":
from optparse import OptionParser
from optparse import OptionParser
parser = OptionParser()
parser = OptionParser()
parser.add_option("-j", "--juncfiles", dest="juncfiles",
parser.add_option("-j", "--juncfiles", dest="juncfiles",
help="text file with all junction files to be processed")
help="text file with all junction files to be processed")
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
help="output prefix (default leafcutter)")
help="output prefix (default leafcutter)")
parser.add_option("-q", "--quiet",
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
help="don't print status messages to stdout")
parser.add_option("-r", "--rundir", dest="rundir", default='./',
parser.add_option("-r", "--rundir", dest="rundir", default='./',
help="write to directory (default ./)")
help="write to directory (default ./)")
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
help="maximum intron length in bp (default 100,000bp)")
help="maximum intron length in bp (default 100,000bp)")
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
help="minimum reads in a cluster (default 30 reads)")
help="minimum reads in a cluster (default 30 reads)")
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
複製
已複製
複製
已複製
help="minimum reads
for an intron
(default 5 reads)")
help="minimum reads
in a cluster
(default 5 reads)")
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
複製
已複製
複製
已複製
parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False,
help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)")
複製
已複製
複製
已複製
parser.add_option("-
s
", "--
strand
", dest="
strand",
default = False,
parser.add_option("-
c
", "--
cluster
", dest="
cluster", default = None,
help="
use strand info (
default
=
False
)
")
help="refined cluster file when clusters are already made")
parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",
default = False,
help="
check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...")
parser.add_option("-C", "--includeconst", dest="const", action="store_true",
default
=
False
,
help="also include constitutive introns
")
複製
已複製
複製
已複製
parser.add_option("-s", "--strand", dest="strand", default = True,
help="use strand info (default=True)")
(options, args) = parser.parse_args()
(options, args) = parser.parse_args()
if options.juncfiles == None:
if options.juncfiles == None:
sys.stderr.write("Error: no junction file provided...\n")
sys.stderr.write("Error: no junction file provided...\n")
exit(0)
exit(0)
# Get the junction file list
# Get the junction file list
libl = []
libl = []
for junc in open(options.juncfiles):
for junc in open(options.juncfiles):
junc = junc.strip()
junc = junc.strip()
try:
try:
open(junc)
open(junc)
except:
except:
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
exit(0)
exit(0)
libl.append(junc)
libl.append(junc)
main(options, libl)
main(options, libl)
已保存差異
原始文本
開啟檔案
#!/usr/bin/env python import sys import tempfile import os import gzip import shutil def main(options,libl): pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand = lnsplit if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue A, B = int(A), int(B)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom, strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom, strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 1: # if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir refined_cluster = "%s/%s_refined"%(rundir,outPrefix) checkchrom = options.checkchrom useStrand = options.strand runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] chrom = tuple(chrom.split(":")) cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, start, end, dot, count, strand = ln.split() if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue chrom = (chrom, strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (int(start), int(end)+1) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() shutil.move(lsts[0], fnameout+"_perind.counts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() if len(E) == 0: return [], [] current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads for an intron (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)") parser.add_option("-s", "--strand", dest="strand", default = False, help="use strand info (default=False)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
更改後文本
開啟檔案
# https://github.com/griffithlab/regtools # /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc # Using regtools speeds up the junction extraction step by an order of magnitude or more import sys import tempfile import os import gzip import shutil def main(options,libl): if options.cluster == None: pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom,strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 0: # 1 if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir checkchrom = options.checkchrom useStrand = options.strand if options.cluster == None: refined_cluster = "%s/%s_refined"%(rundir,outPrefix) else: refined_cluster = options.cluster runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 chrom = (chrom,strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (A, B) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon chrom = tuple(chrom.split(":")) start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chrom = tuple(chrom.split(":")) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue if options.const: if len(clu) == 1: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() if not options.const: shutil.move(lsts[0], fnameout+"_perind.counts.gz") else: shutil.move(lsts[0], fnameout+"_perind.constcounts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir if not options.const: fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) else: fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads in a cluster (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-c", "--cluster", dest="cluster", default = None, help="refined cluster file when clusters are already made") parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...") parser.add_option("-C", "--includeconst", dest="const", action="store_true",default = False, help="also include constitutive introns") parser.add_option("-s", "--strand", dest="strand", default = True, help="use strand info (default=True)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
尋找差異