Diff
checker
Testo
Testo
Immagini
Documenti
Excel
Cartelle
Legal
Enterprise
Applicazione per desktop
Prezzi
Accedi
Scarica Diffchecker Desktop
Confronta il testo
Trova la differenza tra due file di testo
Strumenti
Cronologia
Editor live
Comprimi invariate
Senza a capo
Layout
Diviso
Unificato
Livello di dettaglio
Intelligente
Parola
Carattere
Evidenziazione sintassi
Scegli sintassi
Ignora
Trasforma testo
Vai alla prima modifica
Modifica input
Diffchecker Desktop
Il modo più sicuro per usare Diffchecker. Ottieni l'app Diffchecker Desktop: i tuoi diff non lasciano mai il tuo computer!
Ottieni Desktop
leafcutter_cluster.py VS leafcutter_cluster_regtools.py
Creato
6 anni fa
Il diff non scade mai
Eliminare
Esporta
Condividere
Spiegare
36 rimozioni
Linee
Totale
Rimosso
Caratteri
Totale
Rimosso
Per continuare a utilizzare questa funzione, aggiorna a
Diff
checker
Pro
Visualizza prezzi
489 linee
Copia tutti
94 aggiunte
Linee
Totale
Aggiunto
Caratteri
Totale
Aggiunto
Per continuare a utilizzare questa funzione, aggiorna a
Diff
checker
Pro
Visualizza prezzi
534 linee
Copia tutti
Copia
Copiato
Copia
Copiato
#!/usr/bin/env python
# https://github.com/griffithlab/regtools
# /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc
# Using regtools speeds up the junction extraction step by an order of magnitude or more
import sys
import sys
import tempfile
import tempfile
import os
import os
import gzip
import gzip
import shutil
import shutil
def main(options,libl):
def main(options,libl):
Copia
Copiato
Copia
Copiato
pool_junc_reads(libl, options)
if options.cluster == None:
refine_clusters(options)
pool_junc_reads(libl, options)
refine_clusters(options)
sort_junctions(libl, options)
sort_junctions(libl, options)
merge_junctions(options)
merge_junctions(options)
get_numers(options)
get_numers(options)
def pool_junc_reads(flist, options):
def pool_junc_reads(flist, options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
maxIntronLen = int(options.maxintronlen)
maxIntronLen = int(options.maxintronlen)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
outFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_pooled"%(rundir,outPrefix)
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
by_chrom = {}
by_chrom = {}
for libl in flist:
for libl in flist:
lib = libl.strip()
lib = libl.strip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
if options.verbose:
if options.verbose:
sys.stderr.write("scanning %s...\n"%lib)
sys.stderr.write("scanning %s...\n"%lib)
Copia
Copiato
Copia
Copiato
if lib[-3:] == ".gz": F = gzip.open(lib)
for ln in
open(lib)
:
else: F = open(lib)
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
Copia
Copiato
Copia
Copiato
chrom, A, B, dot, counts, strand
= lnsplit
chrom, A, B, dot, counts, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= lnsplit
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
Copia
Copiato
Copia
Copiato
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
Copia
Copiato
Copia
Copiato
A, B = int(A)
, int(B)
+1
Aoff, Boff = blockSize.split(",")
A, B = int(A)
+int(Aoff)
, int(B)
-int(Boff)+1
if B-A > int(maxIntronLen): continue
if B-A > int(maxIntronLen): continue
Copia
Copiato
Copia
Copiato
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
except:
except:
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
Copia
Copiato
Copia
Copiato
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
fout = file(outFile, 'w')
fout = file(outFile, 'w')
Ncluster = 0
Ncluster = 0
sys.stderr.write("Parsing...\n")
sys.stderr.write("Parsing...\n")
for chrom in by_chrom:
for chrom in by_chrom:
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks.sort()
read_ks.sort()
Copia
Copiato
Copia
Copiato
sys.stderr.write("%s:%s.."%chrom)
sys.stderr.write("%s:%s.."%chrom)
clu = cluster_intervals(read_ks)[0]
clu = cluster_intervals(read_ks)[0]
for cl in clu:
for cl in clu:
Copia
Copiato
Copia
Copiato
if len(cl) >
1
: #
if cluster has more than one intron
if len(cl) >
0
: #
1
if cluster has more than one intron
buf = '%s:%s '%chrom
buf = '%s:%s '%chrom
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
buf += "%d:%d" % interval + ":%d"%count+ " "
buf += "%d:%d" % interval + ":%d"%count+ " "
fout.write(buf+'\n')
fout.write(buf+'\n')
Copia
Copiato
Copia
Copiato
Ncluster += 1
Ncluster += 1
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
fout.close()
fout.close()
def sort_junctions(libl, options):
def sort_junctions(libl, options):
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copia
Copiato
Copia
Copiato
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
Copia
Copiato
Copia
Copiato
if options.cluster == None:
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
else:
refined_cluster = options.cluster
runName = "%s/%s"%(rundir, outPrefix)
runName = "%s/%s"%(rundir, outPrefix)
exons, cluExons = {}, {}
exons, cluExons = {}, {}
cluN = 0
cluN = 0
for ln in open(refined_cluster):
for ln in open(refined_cluster):
chrom = ln.split()[0]
chrom = ln.split()[0]
Copia
Copiato
Copia
Copiato
chrom = tuple(chrom.split(":"))
cluN += 1
cluN += 1
for exon in ln.split()[1:]:
for exon in ln.split()[1:]:
A, B, count = exon.split(":")
A, B, count = exon.split(":")
if chrom not in exons:
if chrom not in exons:
exons[chrom] = {}
exons[chrom] = {}
Copia
Copiato
Copia
Copiato
exons[chrom][(int(A),int(B))] = cluN
exons[chrom][(int(A),int(B))] = cluN
if cluN not in cluExons:
if cluN not in cluExons:
cluExons[cluN] = []
cluExons[cluN] = []
cluExons[cluN].append((chrom, A, B))
cluExons[cluN].append((chrom, A, B))
merges = {}
merges = {}
for ll in libl:
for ll in libl:
lib=ll.rstrip()
lib=ll.rstrip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
libN = lib
libN = lib
if libN not in merges:
if libN not in merges:
merges[libN] = []
merges[libN] = []
merges[libN].append(lib)
merges[libN].append(lib)
fout_runlibs = file(runName+"_sortedlibs",'w')
fout_runlibs = file(runName+"_sortedlibs",'w')
for libN in merges:
for libN in merges:
libName = "%s/%s"%(rundir,libN.split('/')[-1])
libName = "%s/%s"%(rundir,libN.split('/')[-1])
by_chrom = {}
by_chrom = {}
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
fout_runlibs.write(foutName+'\n')
fout_runlibs.write(foutName+'\n')
if options.verbose:
if options.verbose:
sys.stderr.write("Sorting %s..\n"%libN)
sys.stderr.write("Sorting %s..\n"%libN)
if len(merges[libN]) > 1:
if len(merges[libN]) > 1:
if options.verbose:
if options.verbose:
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
else:
else:
pass
pass
fout = gzip.open(foutName,'w')
fout = gzip.open(foutName,'w')
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
for lib in merges[libN]:
for lib in merges[libN]:
Copia
Copiato
Copia
Copiato
if lib[-3:] == ".gz": F = gzip.open(lib)
else: F = open(lib)
for ln in
open(lib)
:
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
Copia
Copiato
Copia
Copiato
chrom,
start, end
, dot, count, strand
= ln
.
split
()
chrom,
A, B
, dot, count, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= ln
split
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
Copia
Copiato
Copia
Copiato
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
Copia
Copiato
Copia
Copiato
chrom = (chrom,
strand)
Aoff, Boff = blockSize.split(",")
A, B = int(A)+int(Aoff), int(B)-int(Boff)+1
chrom = (chrom,
strand)
if chrom not in by_chrom:
if chrom not in by_chrom:
by_chrom[chrom] = {}
by_chrom[chrom] = {}
Copia
Copiato
Copia
Copiato
intron = (
int(start), int(end)+1)
intron = (
A, B)
if intron in by_chrom[chrom]:
if intron in by_chrom[chrom]:
by_chrom[chrom][intron] += int(count)
by_chrom[chrom][intron] += int(count)
else:
else:
by_chrom[chrom][intron] = int(count)
by_chrom[chrom][intron] = int(count)
Copia
Copiato
Copia
Copiato
for clu in cluExons:
for clu in cluExons:
buf = []
buf = []
ks = cluExons[clu]
ks = cluExons[clu]
ks.sort()
ks.sort()
tot = 0
tot = 0
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
Copia
Copiato
Copia
Copiato
chrom = tuple(chrom.split(":"))
start, end = int(start), int(end)
start, end = int(start), int(end)
if chrom not in by_chrom:
if chrom not in by_chrom:
pass
pass
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
tot += by_chrom[chrom][(start,end)]
tot += by_chrom[chrom][(start,end)]
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
start, end = int(start), int(end)
start, end = int(start), int(end)
Copia
Copiato
Copia
Copiato
chrom = tuple(chrom.split(":"))
chromID, strand = chrom
chromID, strand = chrom
if chrom not in by_chrom:
if chrom not in by_chrom:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
Copia
Copiato
Copia
Copiato
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
else:
else:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
fout.write("".join(buf))
fout.write("".join(buf))
fout.close()
fout.close()
fout_runlibs.close()
fout_runlibs.close()
def refine_clusters(options):
def refine_clusters(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
minratio = float(options.mincluratio)
minratio = float(options.mincluratio)
minclureads = int(options.minclureads)
minclureads = int(options.minclureads)
minreads = int(options.minreads)
minreads = int(options.minreads)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
Copia
Copiato
Copia
Copiato
fout = file(outFile,'w')
fout = file(outFile,'w')
Copia
Copiato
Copia
Copiato
Ncl = 0
Ncl = 0
for ln in open(inFile):
for ln in open(inFile):
clu = []
clu = []
totN = 0
totN = 0
chrom = ln.split()[0]
chrom = ln.split()[0]
for ex in ln.split()[1:]:
for ex in ln.split()[1:]:
A, B, N = ex.split(":")
A, B, N = ex.split(":")
clu.append(((int(A),int(B)), int(N)))
clu.append(((int(A),int(B)), int(N)))
totN += int(N)
totN += int(N)
Copia
Copiato
Copia
Copiato
if totN < minclureads: continue
if totN < minclureads: continue
Copia
Copiato
Copia
Copiato
if options.const:
if len(clu) == 1:
buf = '%s ' % chrom
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
fout.write(buf+'\n')
for cl in refine_linked(clu):
for cl in refine_linked(clu):
rc = refine_cluster(cl,minratio, minreads)
rc = refine_cluster(cl,minratio, minreads)
Copia
Copiato
Copia
Copiato
if len(rc) > 0:
if len(rc) > 0:
for clu in rc:
for clu in rc:
buf = '%s ' % chrom
buf = '%s ' % chrom
for interval, count in clu:
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
Ncl += 1
fout.write(buf+'\n')
fout.write(buf+'\n')
Copia
Copiato
Copia
Copiato
sys.stderr.write("Split into %s clusters...\n"%Ncl)
sys.stderr.write("Split into %s clusters...\n"%Ncl)
fout.close()
fout.close()
def merge_junctions(options):
def merge_junctions(options):
''' function to merge junctions '''
''' function to merge junctions '''
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copia
Copiato
Copia
Copiato
fnameout = "%s/%s"%(rundir,outPrefix)
fnameout = "%s/%s"%(rundir,outPrefix)
Copia
Copiato
Copia
Copiato
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
lsts = []
lsts = []
for ln in open(flist):
for ln in open(flist):
lsts.append(ln.strip())
lsts.append(ln.strip())
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
# Change 300 if max open file is < 300
# Change 300 if max open file is < 300
N = min([300, max([100, int(len(lsts)**(0.5))])])
N = min([300, max([100, int(len(lsts)**(0.5))])])
tmpfiles = []
tmpfiles = []
while len(lsts) > 1:
while len(lsts) > 1:
clst = []
clst = []
for i in range(0,(len(lsts)/N)+1):
for i in range(0,(len(lsts)/N)+1):
lst = lsts[N*i:N*(i+1)]
lst = lsts[N*i:N*(i+1)]
if len(lst) > 0:
if len(lst) > 0:
clst.append(lst)
clst.append(lst)
lsts = []
lsts = []
for lst in clst:
for lst in clst:
if len(lst) == 0: continue
if len(lst) == 0: continue
tmpfile = tempfile.mktemp()
tmpfile = tempfile.mktemp()
os.mkdir(tmpfile)
os.mkdir(tmpfile)
foutname = tmpfile+"/tmpmerge.gz"
foutname = tmpfile+"/tmpmerge.gz"
fout = gzip.open(foutname,'w')
fout = gzip.open(foutname,'w')
merge_files(lst, fout, options)
merge_files(lst, fout, options)
lsts.append(foutname)
lsts.append(foutname)
tmpfiles.append(foutname)
tmpfiles.append(foutname)
fout.close()
fout.close()
Copia
Copiato
Copia
Copiato
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
if not options.const:
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
else:
shutil.move(lsts[0], fnameout+"_perind.constcounts.gz")
def merge_files(fnames, fout, options):
def merge_files(fnames, fout, options):
fopen = []
fopen = []
for fname in fnames:
for fname in fnames:
if fname[-3:] == ".gz":
if fname[-3:] == ".gz":
fopen.append(gzip.open(fname))
fopen.append(gzip.open(fname))
else:
else:
fopen.append(open(fname))
fopen.append(open(fname))
finished = False
finished = False
N = 0
N = 0
while not finished:
while not finished:
N += 1
N += 1
if N % 50000 == 0:
if N % 50000 == 0:
sys.stderr.write(".")
sys.stderr.write(".")
buf = []
buf = []
for f in fopen:
for f in fopen:
ln = f.readline().split()
ln = f.readline().split()
if len(ln) == 0:
if len(ln) == 0:
finished = True
finished = True
break
break
chrom = ln[0]
chrom = ln[0]
data = ln[1:]
data = ln[1:]
if len(buf) == 0:
if len(buf) == 0:
buf.append(chrom)
buf.append(chrom)
buf += data
buf += data
if len(buf) > 0:
if len(buf) > 0:
if buf[0] == "chrom":
if buf[0] == "chrom":
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d files"%(len(buf)-1))
sys.stderr.write("merging %d files"%(len(buf)-1))
fout.write(" ".join(buf)+'\n')
fout.write(" ".join(buf)+'\n')
else:
else:
break
break
sys.stderr.write(" done.\n")
sys.stderr.write(" done.\n")
for fin in fopen:
for fin in fopen:
fin.close()
fin.close()
def cluster_intervals(E):
def cluster_intervals(E):
''' Clusters intervals together. '''
''' Clusters intervals together. '''
E.sort()
E.sort()
Copia
Copiato
Copia
Copiato
if len(E) == 0:
return [], []
current = E[0]
current = E[0]
Eclusters, cluster = [], []
Eclusters, cluster = [], []
i = 0
i = 0
while i < len(E):
while i < len(E):
if overlaps(E[i], current):
if overlaps(E[i], current):
cluster.append(E[i])
cluster.append(E[i])
else:
else:
Eclusters.append(cluster)
Eclusters.append(cluster)
cluster = [E[i]]
cluster = [E[i]]
current = (E[i][0], max([current[1], E[i][1]]))
current = (E[i][0], max([current[1], E[i][1]]))
i += 1
i += 1
if len(cluster) > 0:
if len(cluster) > 0:
Eclusters.append(cluster)
Eclusters.append(cluster)
return Eclusters, E
return Eclusters, E
def overlaps(A,B):
def overlaps(A,B):
'''
'''
Checks if A and B overlaps
Checks if A and B overlaps
'''
'''
if A[1] < B[0] or B[1] < A[0]:
if A[1] < B[0] or B[1] < A[0]:
return False
return False
else: return True
else: return True
def refine_linked(clusters):
def refine_linked(clusters):
unassigned = [x for x in clusters[1:]]
unassigned = [x for x in clusters[1:]]
current = [clusters[0]]
current = [clusters[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
newClusters = []
newClusters = []
while len(unassigned) > 0:
while len(unassigned) > 0:
finished = False
finished = False
while not finished:
while not finished:
finished = True
finished = True
torm = []
torm = []
for intron in unassigned:
for intron in unassigned:
inter, count = intron
inter, count = intron
start, end = inter
start, end = inter
if start in splicesites or end in splicesites:
if start in splicesites or end in splicesites:
current.append(intron)
current.append(intron)
splicesites.add(start)
splicesites.add(start)
splicesites.add(end)
splicesites.add(end)
finished = False
finished = False
torm.append(intron)
torm.append(intron)
for intron in torm:
for intron in torm:
unassigned.remove(intron)
unassigned.remove(intron)
newClusters.append(current)
newClusters.append(current)
current = []
current = []
if len(unassigned) > 0:
if len(unassigned) > 0:
current = [unassigned[0]]
current = [unassigned[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
unassigned = unassigned[1:]
unassigned = unassigned[1:]
return newClusters
return newClusters
def refine_cluster(clu, cutoff, readcutoff):
def refine_cluster(clu, cutoff, readcutoff):
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
remove and recluster '''
remove and recluster '''
remove = []
remove = []
dic = {}
dic = {}
intervals = []
intervals = []
reCLU = False
reCLU = False
totN = 0
totN = 0
for inter, count in clu:
for inter, count in clu:
totN += count
totN += count
for inter, count in clu:
for inter, count in clu:
if (count/float(totN) >= cutoff and count >= readcutoff):
if (count/float(totN) >= cutoff and count >= readcutoff):
intervals.append(inter)
intervals.append(inter)
dic[inter] = count
dic[inter] = count
else:
else:
reCLU = True
reCLU = True
if len(intervals) == 0: return []
if len(intervals) == 0: return []
# This makes sure that after trimming, the clusters are still good
# This makes sure that after trimming, the clusters are still good
Atmp, B = cluster_intervals(intervals)
Atmp, B = cluster_intervals(intervals)
A = []
A = []
for cl in Atmp:
for cl in Atmp:
for c in refine_linked([(x,0) for x in cl]):
for c in refine_linked([(x,0) for x in cl]):
if len(c) > 0:
if len(c) > 0:
A.append([x[0] for x in c])
A.append([x[0] for x in c])
if len(A) == 1:
if len(A) == 1:
rc = [(x, dic[x]) for x in A[0]]
rc = [(x, dic[x]) for x in A[0]]
if len(rc) > 1:
if len(rc) > 1:
if reCLU:
if reCLU:
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
else:
else:
return [[(x, dic[x]) for x in A[0]]]
return [[(x, dic[x]) for x in A[0]]]
else:
else:
return []
return []
NCs = []
NCs = []
for c in A:
for c in A:
if len(c) > 1:
if len(c) > 1:
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NCs += NC
NCs += NC
Copia
Copiato
Copia
Copiato
return NCs
return NCs
def get_numers(options):
def get_numers(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copia
Copiato
Copia
Copiato
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
if not options.const:
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
else:
fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix)
input_file=gzip.open(fname, 'rb')
input_file=gzip.open(fname, 'rb')
fout = gzip.open(fnameout,'w')
fout = gzip.open(fnameout,'w')
first_line=True
first_line=True
for l in input_file:
for l in input_file:
if first_line:
if first_line:
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
first_line=False
first_line=False
else:
else:
l=l.strip()
l=l.strip()
words=l.split(" ")
words=l.split(" ")
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
input_file.close()
input_file.close()
fout.close()
fout.close()
if __name__ == "__main__":
if __name__ == "__main__":
from optparse import OptionParser
from optparse import OptionParser
parser = OptionParser()
parser = OptionParser()
parser.add_option("-j", "--juncfiles", dest="juncfiles",
parser.add_option("-j", "--juncfiles", dest="juncfiles",
help="text file with all junction files to be processed")
help="text file with all junction files to be processed")
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
help="output prefix (default leafcutter)")
help="output prefix (default leafcutter)")
parser.add_option("-q", "--quiet",
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
help="don't print status messages to stdout")
parser.add_option("-r", "--rundir", dest="rundir", default='./',
parser.add_option("-r", "--rundir", dest="rundir", default='./',
help="write to directory (default ./)")
help="write to directory (default ./)")
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
help="maximum intron length in bp (default 100,000bp)")
help="maximum intron length in bp (default 100,000bp)")
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
help="minimum reads in a cluster (default 30 reads)")
help="minimum reads in a cluster (default 30 reads)")
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
Copia
Copiato
Copia
Copiato
help="minimum reads
for an intron
(default 5 reads)")
help="minimum reads
in a cluster
(default 5 reads)")
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
Copia
Copiato
Copia
Copiato
parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False,
help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)")
Copia
Copiato
Copia
Copiato
parser.add_option("-
s
", "--
strand
", dest="
strand",
default = False,
parser.add_option("-
c
", "--
cluster
", dest="
cluster", default = None,
help="
use strand info (
default
=
False
)
")
help="refined cluster file when clusters are already made")
parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",
default = False,
help="
check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...")
parser.add_option("-C", "--includeconst", dest="const", action="store_true",
default
=
False
,
help="also include constitutive introns
")
Copia
Copiato
Copia
Copiato
parser.add_option("-s", "--strand", dest="strand", default = True,
help="use strand info (default=True)")
(options, args) = parser.parse_args()
(options, args) = parser.parse_args()
if options.juncfiles == None:
if options.juncfiles == None:
sys.stderr.write("Error: no junction file provided...\n")
sys.stderr.write("Error: no junction file provided...\n")
exit(0)
exit(0)
# Get the junction file list
# Get the junction file list
libl = []
libl = []
for junc in open(options.juncfiles):
for junc in open(options.juncfiles):
junc = junc.strip()
junc = junc.strip()
try:
try:
open(junc)
open(junc)
except:
except:
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
exit(0)
exit(0)
libl.append(junc)
libl.append(junc)
main(options, libl)
main(options, libl)
Diff salvati
Testo originale
Apri file
#!/usr/bin/env python import sys import tempfile import os import gzip import shutil def main(options,libl): pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand = lnsplit if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue A, B = int(A), int(B)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom, strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom, strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 1: # if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir refined_cluster = "%s/%s_refined"%(rundir,outPrefix) checkchrom = options.checkchrom useStrand = options.strand runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] chrom = tuple(chrom.split(":")) cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, start, end, dot, count, strand = ln.split() if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue chrom = (chrom, strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (int(start), int(end)+1) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() shutil.move(lsts[0], fnameout+"_perind.counts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() if len(E) == 0: return [], [] current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads for an intron (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)") parser.add_option("-s", "--strand", dest="strand", default = False, help="use strand info (default=False)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
Testo modificato
Apri file
# https://github.com/griffithlab/regtools # /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc # Using regtools speeds up the junction extraction step by an order of magnitude or more import sys import tempfile import os import gzip import shutil def main(options,libl): if options.cluster == None: pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom,strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 0: # 1 if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir checkchrom = options.checkchrom useStrand = options.strand if options.cluster == None: refined_cluster = "%s/%s_refined"%(rundir,outPrefix) else: refined_cluster = options.cluster runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 chrom = (chrom,strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (A, B) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon chrom = tuple(chrom.split(":")) start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chrom = tuple(chrom.split(":")) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue if options.const: if len(clu) == 1: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() if not options.const: shutil.move(lsts[0], fnameout+"_perind.counts.gz") else: shutil.move(lsts[0], fnameout+"_perind.constcounts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir if not options.const: fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) else: fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads in a cluster (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-c", "--cluster", dest="cluster", default = None, help="refined cluster file when clusters are already made") parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...") parser.add_option("-C", "--includeconst", dest="const", action="store_true",default = False, help="also include constitutive introns") parser.add_option("-s", "--strand", dest="strand", default = True, help="use strand info (default=True)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
Trovare la differenza