Diff
checker
टेक्स्ट
टेक्स्ट
छवियां
दस्तावेज़
Excel
फ़ोल्डर्स
Legal
Enterprise
डेस्कटॉप
मूल्य
साइन इन करें
Diffchecker डेस्कटॉप डाउनलोड करें
टेक्स्ट की तुलना करें
दो टेक्स्ट फ़ाइलों के बीच अंतर ढूंढें
उपकरण
इतिहास
रियल-टाइम एडिटर
अपरिवर्तित संक्षिप्त करें
लाइन रैप बंद
लेआउट
विभाजित
संयुक्त
परिवर्तन हाइलाइट करें
स्मार्ट
शब्द
अक्षर
सिंटैक्स हाइलाइटिंग
सिंटैक्स चुनें
अनदेखा करें
टेक्स्ट बदलें
पहले अंतर पर जाएँ
इनपुट संपादित करें
Diffchecker Desktop
Diffchecker चलाने का सबसे सुरक्षित तरीका। Diffchecker Desktop ऐप पाएं: आपके diffs कभी आपके कंप्यूटर से बाहर नहीं जाते!
Desktop पाएं
leafcutter_cluster.py VS leafcutter_cluster_regtools.py
बनाया गया
6 वर्ष पहले
Diff कभी समाप्त नहीं होता
साफ़
निर्यात करें
शेयर करें
समझाएं
36 हटाए गए
लाइनें
कुल
हटाया गया
अक्षर
कुल
हटाया गया
इस सुविधा का उपयोग जारी रखने के लिए, अपग्रेड करें
Diff
checker
Pro
मूल्य देखें
489 लाइनें
सभी को कॉपी करें
94 जोड़े गए
लाइनें
कुल
जोड़ा गया
अक्षर
कुल
जोड़ा गया
इस सुविधा का उपयोग जारी रखने के लिए, अपग्रेड करें
Diff
checker
Pro
मूल्य देखें
534 लाइनें
सभी को कॉपी करें
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
#!/usr/bin/env python
# https://github.com/griffithlab/regtools
# /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc
# Using regtools speeds up the junction extraction step by an order of magnitude or more
import sys
import sys
import tempfile
import tempfile
import os
import os
import gzip
import gzip
import shutil
import shutil
def main(options,libl):
def main(options,libl):
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
pool_junc_reads(libl, options)
if options.cluster == None:
refine_clusters(options)
pool_junc_reads(libl, options)
refine_clusters(options)
sort_junctions(libl, options)
sort_junctions(libl, options)
merge_junctions(options)
merge_junctions(options)
get_numers(options)
get_numers(options)
def pool_junc_reads(flist, options):
def pool_junc_reads(flist, options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
maxIntronLen = int(options.maxintronlen)
maxIntronLen = int(options.maxintronlen)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
outFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_pooled"%(rundir,outPrefix)
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
by_chrom = {}
by_chrom = {}
for libl in flist:
for libl in flist:
lib = libl.strip()
lib = libl.strip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
if options.verbose:
if options.verbose:
sys.stderr.write("scanning %s...\n"%lib)
sys.stderr.write("scanning %s...\n"%lib)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if lib[-3:] == ".gz": F = gzip.open(lib)
for ln in
open(lib)
:
else: F = open(lib)
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
chrom, A, B, dot, counts, strand
= lnsplit
chrom, A, B, dot, counts, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= lnsplit
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
A, B = int(A)
, int(B)
+1
Aoff, Boff = blockSize.split(",")
A, B = int(A)
+int(Aoff)
, int(B)
-int(Boff)+1
if B-A > int(maxIntronLen): continue
if B-A > int(maxIntronLen): continue
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
except:
except:
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
fout = file(outFile, 'w')
fout = file(outFile, 'w')
Ncluster = 0
Ncluster = 0
sys.stderr.write("Parsing...\n")
sys.stderr.write("Parsing...\n")
for chrom in by_chrom:
for chrom in by_chrom:
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks.sort()
read_ks.sort()
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
sys.stderr.write("%s:%s.."%chrom)
sys.stderr.write("%s:%s.."%chrom)
clu = cluster_intervals(read_ks)[0]
clu = cluster_intervals(read_ks)[0]
for cl in clu:
for cl in clu:
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if len(cl) >
1
: #
if cluster has more than one intron
if len(cl) >
0
: #
1
if cluster has more than one intron
buf = '%s:%s '%chrom
buf = '%s:%s '%chrom
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
buf += "%d:%d" % interval + ":%d"%count+ " "
buf += "%d:%d" % interval + ":%d"%count+ " "
fout.write(buf+'\n')
fout.write(buf+'\n')
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
Ncluster += 1
Ncluster += 1
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
fout.close()
fout.close()
def sort_junctions(libl, options):
def sort_junctions(libl, options):
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if options.cluster == None:
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
else:
refined_cluster = options.cluster
runName = "%s/%s"%(rundir, outPrefix)
runName = "%s/%s"%(rundir, outPrefix)
exons, cluExons = {}, {}
exons, cluExons = {}, {}
cluN = 0
cluN = 0
for ln in open(refined_cluster):
for ln in open(refined_cluster):
chrom = ln.split()[0]
chrom = ln.split()[0]
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
chrom = tuple(chrom.split(":"))
cluN += 1
cluN += 1
for exon in ln.split()[1:]:
for exon in ln.split()[1:]:
A, B, count = exon.split(":")
A, B, count = exon.split(":")
if chrom not in exons:
if chrom not in exons:
exons[chrom] = {}
exons[chrom] = {}
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
exons[chrom][(int(A),int(B))] = cluN
exons[chrom][(int(A),int(B))] = cluN
if cluN not in cluExons:
if cluN not in cluExons:
cluExons[cluN] = []
cluExons[cluN] = []
cluExons[cluN].append((chrom, A, B))
cluExons[cluN].append((chrom, A, B))
merges = {}
merges = {}
for ll in libl:
for ll in libl:
lib=ll.rstrip()
lib=ll.rstrip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
libN = lib
libN = lib
if libN not in merges:
if libN not in merges:
merges[libN] = []
merges[libN] = []
merges[libN].append(lib)
merges[libN].append(lib)
fout_runlibs = file(runName+"_sortedlibs",'w')
fout_runlibs = file(runName+"_sortedlibs",'w')
for libN in merges:
for libN in merges:
libName = "%s/%s"%(rundir,libN.split('/')[-1])
libName = "%s/%s"%(rundir,libN.split('/')[-1])
by_chrom = {}
by_chrom = {}
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
fout_runlibs.write(foutName+'\n')
fout_runlibs.write(foutName+'\n')
if options.verbose:
if options.verbose:
sys.stderr.write("Sorting %s..\n"%libN)
sys.stderr.write("Sorting %s..\n"%libN)
if len(merges[libN]) > 1:
if len(merges[libN]) > 1:
if options.verbose:
if options.verbose:
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
else:
else:
pass
pass
fout = gzip.open(foutName,'w')
fout = gzip.open(foutName,'w')
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
for lib in merges[libN]:
for lib in merges[libN]:
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if lib[-3:] == ".gz": F = gzip.open(lib)
else: F = open(lib)
for ln in
open(lib)
:
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
chrom,
start, end
, dot, count, strand
= ln
.
split
()
chrom,
A, B
, dot, count, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= ln
split
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
chrom = (chrom,
strand)
Aoff, Boff = blockSize.split(",")
A, B = int(A)+int(Aoff), int(B)-int(Boff)+1
chrom = (chrom,
strand)
if chrom not in by_chrom:
if chrom not in by_chrom:
by_chrom[chrom] = {}
by_chrom[chrom] = {}
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
intron = (
int(start), int(end)+1)
intron = (
A, B)
if intron in by_chrom[chrom]:
if intron in by_chrom[chrom]:
by_chrom[chrom][intron] += int(count)
by_chrom[chrom][intron] += int(count)
else:
else:
by_chrom[chrom][intron] = int(count)
by_chrom[chrom][intron] = int(count)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
for clu in cluExons:
for clu in cluExons:
buf = []
buf = []
ks = cluExons[clu]
ks = cluExons[clu]
ks.sort()
ks.sort()
tot = 0
tot = 0
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
chrom = tuple(chrom.split(":"))
start, end = int(start), int(end)
start, end = int(start), int(end)
if chrom not in by_chrom:
if chrom not in by_chrom:
pass
pass
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
tot += by_chrom[chrom][(start,end)]
tot += by_chrom[chrom][(start,end)]
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
start, end = int(start), int(end)
start, end = int(start), int(end)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
chrom = tuple(chrom.split(":"))
chromID, strand = chrom
chromID, strand = chrom
if chrom not in by_chrom:
if chrom not in by_chrom:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
else:
else:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
fout.write("".join(buf))
fout.write("".join(buf))
fout.close()
fout.close()
fout_runlibs.close()
fout_runlibs.close()
def refine_clusters(options):
def refine_clusters(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
minratio = float(options.mincluratio)
minratio = float(options.mincluratio)
minclureads = int(options.minclureads)
minclureads = int(options.minclureads)
minreads = int(options.minreads)
minreads = int(options.minreads)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
fout = file(outFile,'w')
fout = file(outFile,'w')
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
Ncl = 0
Ncl = 0
for ln in open(inFile):
for ln in open(inFile):
clu = []
clu = []
totN = 0
totN = 0
chrom = ln.split()[0]
chrom = ln.split()[0]
for ex in ln.split()[1:]:
for ex in ln.split()[1:]:
A, B, N = ex.split(":")
A, B, N = ex.split(":")
clu.append(((int(A),int(B)), int(N)))
clu.append(((int(A),int(B)), int(N)))
totN += int(N)
totN += int(N)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if totN < minclureads: continue
if totN < minclureads: continue
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if options.const:
if len(clu) == 1:
buf = '%s ' % chrom
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
fout.write(buf+'\n')
for cl in refine_linked(clu):
for cl in refine_linked(clu):
rc = refine_cluster(cl,minratio, minreads)
rc = refine_cluster(cl,minratio, minreads)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if len(rc) > 0:
if len(rc) > 0:
for clu in rc:
for clu in rc:
buf = '%s ' % chrom
buf = '%s ' % chrom
for interval, count in clu:
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
Ncl += 1
fout.write(buf+'\n')
fout.write(buf+'\n')
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
sys.stderr.write("Split into %s clusters...\n"%Ncl)
sys.stderr.write("Split into %s clusters...\n"%Ncl)
fout.close()
fout.close()
def merge_junctions(options):
def merge_junctions(options):
''' function to merge junctions '''
''' function to merge junctions '''
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
fnameout = "%s/%s"%(rundir,outPrefix)
fnameout = "%s/%s"%(rundir,outPrefix)
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
lsts = []
lsts = []
for ln in open(flist):
for ln in open(flist):
lsts.append(ln.strip())
lsts.append(ln.strip())
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
# Change 300 if max open file is < 300
# Change 300 if max open file is < 300
N = min([300, max([100, int(len(lsts)**(0.5))])])
N = min([300, max([100, int(len(lsts)**(0.5))])])
tmpfiles = []
tmpfiles = []
while len(lsts) > 1:
while len(lsts) > 1:
clst = []
clst = []
for i in range(0,(len(lsts)/N)+1):
for i in range(0,(len(lsts)/N)+1):
lst = lsts[N*i:N*(i+1)]
lst = lsts[N*i:N*(i+1)]
if len(lst) > 0:
if len(lst) > 0:
clst.append(lst)
clst.append(lst)
lsts = []
lsts = []
for lst in clst:
for lst in clst:
if len(lst) == 0: continue
if len(lst) == 0: continue
tmpfile = tempfile.mktemp()
tmpfile = tempfile.mktemp()
os.mkdir(tmpfile)
os.mkdir(tmpfile)
foutname = tmpfile+"/tmpmerge.gz"
foutname = tmpfile+"/tmpmerge.gz"
fout = gzip.open(foutname,'w')
fout = gzip.open(foutname,'w')
merge_files(lst, fout, options)
merge_files(lst, fout, options)
lsts.append(foutname)
lsts.append(foutname)
tmpfiles.append(foutname)
tmpfiles.append(foutname)
fout.close()
fout.close()
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
if not options.const:
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
else:
shutil.move(lsts[0], fnameout+"_perind.constcounts.gz")
def merge_files(fnames, fout, options):
def merge_files(fnames, fout, options):
fopen = []
fopen = []
for fname in fnames:
for fname in fnames:
if fname[-3:] == ".gz":
if fname[-3:] == ".gz":
fopen.append(gzip.open(fname))
fopen.append(gzip.open(fname))
else:
else:
fopen.append(open(fname))
fopen.append(open(fname))
finished = False
finished = False
N = 0
N = 0
while not finished:
while not finished:
N += 1
N += 1
if N % 50000 == 0:
if N % 50000 == 0:
sys.stderr.write(".")
sys.stderr.write(".")
buf = []
buf = []
for f in fopen:
for f in fopen:
ln = f.readline().split()
ln = f.readline().split()
if len(ln) == 0:
if len(ln) == 0:
finished = True
finished = True
break
break
chrom = ln[0]
chrom = ln[0]
data = ln[1:]
data = ln[1:]
if len(buf) == 0:
if len(buf) == 0:
buf.append(chrom)
buf.append(chrom)
buf += data
buf += data
if len(buf) > 0:
if len(buf) > 0:
if buf[0] == "chrom":
if buf[0] == "chrom":
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d files"%(len(buf)-1))
sys.stderr.write("merging %d files"%(len(buf)-1))
fout.write(" ".join(buf)+'\n')
fout.write(" ".join(buf)+'\n')
else:
else:
break
break
sys.stderr.write(" done.\n")
sys.stderr.write(" done.\n")
for fin in fopen:
for fin in fopen:
fin.close()
fin.close()
def cluster_intervals(E):
def cluster_intervals(E):
''' Clusters intervals together. '''
''' Clusters intervals together. '''
E.sort()
E.sort()
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
if len(E) == 0:
return [], []
current = E[0]
current = E[0]
Eclusters, cluster = [], []
Eclusters, cluster = [], []
i = 0
i = 0
while i < len(E):
while i < len(E):
if overlaps(E[i], current):
if overlaps(E[i], current):
cluster.append(E[i])
cluster.append(E[i])
else:
else:
Eclusters.append(cluster)
Eclusters.append(cluster)
cluster = [E[i]]
cluster = [E[i]]
current = (E[i][0], max([current[1], E[i][1]]))
current = (E[i][0], max([current[1], E[i][1]]))
i += 1
i += 1
if len(cluster) > 0:
if len(cluster) > 0:
Eclusters.append(cluster)
Eclusters.append(cluster)
return Eclusters, E
return Eclusters, E
def overlaps(A,B):
def overlaps(A,B):
'''
'''
Checks if A and B overlaps
Checks if A and B overlaps
'''
'''
if A[1] < B[0] or B[1] < A[0]:
if A[1] < B[0] or B[1] < A[0]:
return False
return False
else: return True
else: return True
def refine_linked(clusters):
def refine_linked(clusters):
unassigned = [x for x in clusters[1:]]
unassigned = [x for x in clusters[1:]]
current = [clusters[0]]
current = [clusters[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
newClusters = []
newClusters = []
while len(unassigned) > 0:
while len(unassigned) > 0:
finished = False
finished = False
while not finished:
while not finished:
finished = True
finished = True
torm = []
torm = []
for intron in unassigned:
for intron in unassigned:
inter, count = intron
inter, count = intron
start, end = inter
start, end = inter
if start in splicesites or end in splicesites:
if start in splicesites or end in splicesites:
current.append(intron)
current.append(intron)
splicesites.add(start)
splicesites.add(start)
splicesites.add(end)
splicesites.add(end)
finished = False
finished = False
torm.append(intron)
torm.append(intron)
for intron in torm:
for intron in torm:
unassigned.remove(intron)
unassigned.remove(intron)
newClusters.append(current)
newClusters.append(current)
current = []
current = []
if len(unassigned) > 0:
if len(unassigned) > 0:
current = [unassigned[0]]
current = [unassigned[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
unassigned = unassigned[1:]
unassigned = unassigned[1:]
return newClusters
return newClusters
def refine_cluster(clu, cutoff, readcutoff):
def refine_cluster(clu, cutoff, readcutoff):
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
remove and recluster '''
remove and recluster '''
remove = []
remove = []
dic = {}
dic = {}
intervals = []
intervals = []
reCLU = False
reCLU = False
totN = 0
totN = 0
for inter, count in clu:
for inter, count in clu:
totN += count
totN += count
for inter, count in clu:
for inter, count in clu:
if (count/float(totN) >= cutoff and count >= readcutoff):
if (count/float(totN) >= cutoff and count >= readcutoff):
intervals.append(inter)
intervals.append(inter)
dic[inter] = count
dic[inter] = count
else:
else:
reCLU = True
reCLU = True
if len(intervals) == 0: return []
if len(intervals) == 0: return []
# This makes sure that after trimming, the clusters are still good
# This makes sure that after trimming, the clusters are still good
Atmp, B = cluster_intervals(intervals)
Atmp, B = cluster_intervals(intervals)
A = []
A = []
for cl in Atmp:
for cl in Atmp:
for c in refine_linked([(x,0) for x in cl]):
for c in refine_linked([(x,0) for x in cl]):
if len(c) > 0:
if len(c) > 0:
A.append([x[0] for x in c])
A.append([x[0] for x in c])
if len(A) == 1:
if len(A) == 1:
rc = [(x, dic[x]) for x in A[0]]
rc = [(x, dic[x]) for x in A[0]]
if len(rc) > 1:
if len(rc) > 1:
if reCLU:
if reCLU:
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
else:
else:
return [[(x, dic[x]) for x in A[0]]]
return [[(x, dic[x]) for x in A[0]]]
else:
else:
return []
return []
NCs = []
NCs = []
for c in A:
for c in A:
if len(c) > 1:
if len(c) > 1:
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NCs += NC
NCs += NC
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
return NCs
return NCs
def get_numers(options):
def get_numers(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
if not options.const:
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
else:
fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix)
input_file=gzip.open(fname, 'rb')
input_file=gzip.open(fname, 'rb')
fout = gzip.open(fnameout,'w')
fout = gzip.open(fnameout,'w')
first_line=True
first_line=True
for l in input_file:
for l in input_file:
if first_line:
if first_line:
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
first_line=False
first_line=False
else:
else:
l=l.strip()
l=l.strip()
words=l.split(" ")
words=l.split(" ")
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
input_file.close()
input_file.close()
fout.close()
fout.close()
if __name__ == "__main__":
if __name__ == "__main__":
from optparse import OptionParser
from optparse import OptionParser
parser = OptionParser()
parser = OptionParser()
parser.add_option("-j", "--juncfiles", dest="juncfiles",
parser.add_option("-j", "--juncfiles", dest="juncfiles",
help="text file with all junction files to be processed")
help="text file with all junction files to be processed")
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
help="output prefix (default leafcutter)")
help="output prefix (default leafcutter)")
parser.add_option("-q", "--quiet",
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
help="don't print status messages to stdout")
parser.add_option("-r", "--rundir", dest="rundir", default='./',
parser.add_option("-r", "--rundir", dest="rundir", default='./',
help="write to directory (default ./)")
help="write to directory (default ./)")
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
help="maximum intron length in bp (default 100,000bp)")
help="maximum intron length in bp (default 100,000bp)")
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
help="minimum reads in a cluster (default 30 reads)")
help="minimum reads in a cluster (default 30 reads)")
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
help="minimum reads
for an intron
(default 5 reads)")
help="minimum reads
in a cluster
(default 5 reads)")
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False,
help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)")
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
parser.add_option("-
s
", "--
strand
", dest="
strand",
default = False,
parser.add_option("-
c
", "--
cluster
", dest="
cluster", default = None,
help="
use strand info (
default
=
False
)
")
help="refined cluster file when clusters are already made")
parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",
default = False,
help="
check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...")
parser.add_option("-C", "--includeconst", dest="const", action="store_true",
default
=
False
,
help="also include constitutive introns
")
कॉपी
कॉपी हुआ
कॉपी
कॉपी हुआ
parser.add_option("-s", "--strand", dest="strand", default = True,
help="use strand info (default=True)")
(options, args) = parser.parse_args()
(options, args) = parser.parse_args()
if options.juncfiles == None:
if options.juncfiles == None:
sys.stderr.write("Error: no junction file provided...\n")
sys.stderr.write("Error: no junction file provided...\n")
exit(0)
exit(0)
# Get the junction file list
# Get the junction file list
libl = []
libl = []
for junc in open(options.juncfiles):
for junc in open(options.juncfiles):
junc = junc.strip()
junc = junc.strip()
try:
try:
open(junc)
open(junc)
except:
except:
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
exit(0)
exit(0)
libl.append(junc)
libl.append(junc)
main(options, libl)
main(options, libl)
सेव किए गए Diffs
ऑरिजनल टेक्स्ट
फ़ाइल खोलें
#!/usr/bin/env python import sys import tempfile import os import gzip import shutil def main(options,libl): pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand = lnsplit if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue A, B = int(A), int(B)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom, strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom, strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 1: # if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir refined_cluster = "%s/%s_refined"%(rundir,outPrefix) checkchrom = options.checkchrom useStrand = options.strand runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] chrom = tuple(chrom.split(":")) cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, start, end, dot, count, strand = ln.split() if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue chrom = (chrom, strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (int(start), int(end)+1) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() shutil.move(lsts[0], fnameout+"_perind.counts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() if len(E) == 0: return [], [] current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads for an intron (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)") parser.add_option("-s", "--strand", dest="strand", default = False, help="use strand info (default=False)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
परिवर्तित टेक्स्ट
फ़ाइल खोलें
# https://github.com/griffithlab/regtools # /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc # Using regtools speeds up the junction extraction step by an order of magnitude or more import sys import tempfile import os import gzip import shutil def main(options,libl): if options.cluster == None: pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom,strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 0: # 1 if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir checkchrom = options.checkchrom useStrand = options.strand if options.cluster == None: refined_cluster = "%s/%s_refined"%(rundir,outPrefix) else: refined_cluster = options.cluster runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 chrom = (chrom,strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (A, B) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon chrom = tuple(chrom.split(":")) start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chrom = tuple(chrom.split(":")) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue if options.const: if len(clu) == 1: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() if not options.const: shutil.move(lsts[0], fnameout+"_perind.counts.gz") else: shutil.move(lsts[0], fnameout+"_perind.constcounts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir if not options.const: fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) else: fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads in a cluster (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-c", "--cluster", dest="cluster", default = None, help="refined cluster file when clusters are already made") parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...") parser.add_option("-C", "--includeconst", dest="const", action="store_true",default = False, help="also include constitutive introns") parser.add_option("-s", "--strand", dest="strand", default = True, help="use strand info (default=True)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
अंतर खोजें