Diff
checker
Texte
Texte
Images
Documents
Excel
Dossiers
Legal
Enterprise
Application de bureau
Prix
Se connecter
Télécharger Diffchecker Desktop
Comparer le texte
Trouver la différence entre deux fichiers texte
Outils
Historique
Éditeur live
Cacher identiques
Sans retour à la ligne
Vue
Divisé
Unifié
Niveau de précision
Intelligent
Mot
Caractère
Coloration syntaxique
Choisir la syntaxe
Ignorer
Transformer le texte
Aller au premier écart
Modifier l'entrée
Diffchecker Desktop
La façon la plus sécurisée d'utiliser Diffchecker. Obtenez l'application Diffchecker Desktop : vos diffs ne quittent jamais votre ordinateur !
Obtenir Desktop
leafcutter_cluster.py VS leafcutter_cluster_regtools.py
Créé
il y a 6 ans
Le diff n'expire jamais
Effacer
Exporter
Partager
Expliquer
36 suppressions
Lignes
Total
Supprimé
Caractères
Total
Supprimé
Pour continuer à utiliser cette fonctionnalité, passez à
Diff
checker
Pro
Voir les prix
489 lignes
Copier tout
94 ajouts
Lignes
Total
Ajouté
Caractères
Total
Ajouté
Pour continuer à utiliser cette fonctionnalité, passez à
Diff
checker
Pro
Voir les prix
534 lignes
Copier tout
Copier
Copié
Copier
Copié
#!/usr/bin/env python
# https://github.com/griffithlab/regtools
# /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc
# Using regtools speeds up the junction extraction step by an order of magnitude or more
import sys
import sys
import tempfile
import tempfile
import os
import os
import gzip
import gzip
import shutil
import shutil
def main(options,libl):
def main(options,libl):
Copier
Copié
Copier
Copié
pool_junc_reads(libl, options)
if options.cluster == None:
refine_clusters(options)
pool_junc_reads(libl, options)
refine_clusters(options)
sort_junctions(libl, options)
sort_junctions(libl, options)
merge_junctions(options)
merge_junctions(options)
get_numers(options)
get_numers(options)
def pool_junc_reads(flist, options):
def pool_junc_reads(flist, options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
maxIntronLen = int(options.maxintronlen)
maxIntronLen = int(options.maxintronlen)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
outFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_pooled"%(rundir,outPrefix)
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
by_chrom = {}
by_chrom = {}
for libl in flist:
for libl in flist:
lib = libl.strip()
lib = libl.strip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
if options.verbose:
if options.verbose:
sys.stderr.write("scanning %s...\n"%lib)
sys.stderr.write("scanning %s...\n"%lib)
Copier
Copié
Copier
Copié
if lib[-3:] == ".gz": F = gzip.open(lib)
for ln in
open(lib)
:
else: F = open(lib)
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
Copier
Copié
Copier
Copié
chrom, A, B, dot, counts, strand
= lnsplit
chrom, A, B, dot, counts, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= lnsplit
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
Copier
Copié
Copier
Copié
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
Copier
Copié
Copier
Copié
A, B = int(A)
, int(B)
+1
Aoff, Boff = blockSize.split(",")
A, B = int(A)
+int(Aoff)
, int(B)
-int(Boff)+1
if B-A > int(maxIntronLen): continue
if B-A > int(maxIntronLen): continue
Copier
Copié
Copier
Copié
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
except:
except:
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
Copier
Copié
Copier
Copié
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
fout = file(outFile, 'w')
fout = file(outFile, 'w')
Ncluster = 0
Ncluster = 0
sys.stderr.write("Parsing...\n")
sys.stderr.write("Parsing...\n")
for chrom in by_chrom:
for chrom in by_chrom:
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks.sort()
read_ks.sort()
Copier
Copié
Copier
Copié
sys.stderr.write("%s:%s.."%chrom)
sys.stderr.write("%s:%s.."%chrom)
clu = cluster_intervals(read_ks)[0]
clu = cluster_intervals(read_ks)[0]
for cl in clu:
for cl in clu:
Copier
Copié
Copier
Copié
if len(cl) >
1
: #
if cluster has more than one intron
if len(cl) >
0
: #
1
if cluster has more than one intron
buf = '%s:%s '%chrom
buf = '%s:%s '%chrom
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
buf += "%d:%d" % interval + ":%d"%count+ " "
buf += "%d:%d" % interval + ":%d"%count+ " "
fout.write(buf+'\n')
fout.write(buf+'\n')
Copier
Copié
Copier
Copié
Ncluster += 1
Ncluster += 1
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
fout.close()
fout.close()
def sort_junctions(libl, options):
def sort_junctions(libl, options):
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copier
Copié
Copier
Copié
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
Copier
Copié
Copier
Copié
if options.cluster == None:
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
else:
refined_cluster = options.cluster
runName = "%s/%s"%(rundir, outPrefix)
runName = "%s/%s"%(rundir, outPrefix)
exons, cluExons = {}, {}
exons, cluExons = {}, {}
cluN = 0
cluN = 0
for ln in open(refined_cluster):
for ln in open(refined_cluster):
chrom = ln.split()[0]
chrom = ln.split()[0]
Copier
Copié
Copier
Copié
chrom = tuple(chrom.split(":"))
cluN += 1
cluN += 1
for exon in ln.split()[1:]:
for exon in ln.split()[1:]:
A, B, count = exon.split(":")
A, B, count = exon.split(":")
if chrom not in exons:
if chrom not in exons:
exons[chrom] = {}
exons[chrom] = {}
Copier
Copié
Copier
Copié
exons[chrom][(int(A),int(B))] = cluN
exons[chrom][(int(A),int(B))] = cluN
if cluN not in cluExons:
if cluN not in cluExons:
cluExons[cluN] = []
cluExons[cluN] = []
cluExons[cluN].append((chrom, A, B))
cluExons[cluN].append((chrom, A, B))
merges = {}
merges = {}
for ll in libl:
for ll in libl:
lib=ll.rstrip()
lib=ll.rstrip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
libN = lib
libN = lib
if libN not in merges:
if libN not in merges:
merges[libN] = []
merges[libN] = []
merges[libN].append(lib)
merges[libN].append(lib)
fout_runlibs = file(runName+"_sortedlibs",'w')
fout_runlibs = file(runName+"_sortedlibs",'w')
for libN in merges:
for libN in merges:
libName = "%s/%s"%(rundir,libN.split('/')[-1])
libName = "%s/%s"%(rundir,libN.split('/')[-1])
by_chrom = {}
by_chrom = {}
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
fout_runlibs.write(foutName+'\n')
fout_runlibs.write(foutName+'\n')
if options.verbose:
if options.verbose:
sys.stderr.write("Sorting %s..\n"%libN)
sys.stderr.write("Sorting %s..\n"%libN)
if len(merges[libN]) > 1:
if len(merges[libN]) > 1:
if options.verbose:
if options.verbose:
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
else:
else:
pass
pass
fout = gzip.open(foutName,'w')
fout = gzip.open(foutName,'w')
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
for lib in merges[libN]:
for lib in merges[libN]:
Copier
Copié
Copier
Copié
if lib[-3:] == ".gz": F = gzip.open(lib)
else: F = open(lib)
for ln in
open(lib)
:
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
Copier
Copié
Copier
Copié
chrom,
start, end
, dot, count, strand
= ln
.
split
()
chrom,
A, B
, dot, count, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= ln
split
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
Copier
Copié
Copier
Copié
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
Copier
Copié
Copier
Copié
chrom = (chrom,
strand)
Aoff, Boff = blockSize.split(",")
A, B = int(A)+int(Aoff), int(B)-int(Boff)+1
chrom = (chrom,
strand)
if chrom not in by_chrom:
if chrom not in by_chrom:
by_chrom[chrom] = {}
by_chrom[chrom] = {}
Copier
Copié
Copier
Copié
intron = (
int(start), int(end)+1)
intron = (
A, B)
if intron in by_chrom[chrom]:
if intron in by_chrom[chrom]:
by_chrom[chrom][intron] += int(count)
by_chrom[chrom][intron] += int(count)
else:
else:
by_chrom[chrom][intron] = int(count)
by_chrom[chrom][intron] = int(count)
Copier
Copié
Copier
Copié
for clu in cluExons:
for clu in cluExons:
buf = []
buf = []
ks = cluExons[clu]
ks = cluExons[clu]
ks.sort()
ks.sort()
tot = 0
tot = 0
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
Copier
Copié
Copier
Copié
chrom = tuple(chrom.split(":"))
start, end = int(start), int(end)
start, end = int(start), int(end)
if chrom not in by_chrom:
if chrom not in by_chrom:
pass
pass
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
tot += by_chrom[chrom][(start,end)]
tot += by_chrom[chrom][(start,end)]
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
start, end = int(start), int(end)
start, end = int(start), int(end)
Copier
Copié
Copier
Copié
chrom = tuple(chrom.split(":"))
chromID, strand = chrom
chromID, strand = chrom
if chrom not in by_chrom:
if chrom not in by_chrom:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
Copier
Copié
Copier
Copié
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
else:
else:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
fout.write("".join(buf))
fout.write("".join(buf))
fout.close()
fout.close()
fout_runlibs.close()
fout_runlibs.close()
def refine_clusters(options):
def refine_clusters(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
minratio = float(options.mincluratio)
minratio = float(options.mincluratio)
minclureads = int(options.minclureads)
minclureads = int(options.minclureads)
minreads = int(options.minreads)
minreads = int(options.minreads)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
Copier
Copié
Copier
Copié
fout = file(outFile,'w')
fout = file(outFile,'w')
Copier
Copié
Copier
Copié
Ncl = 0
Ncl = 0
for ln in open(inFile):
for ln in open(inFile):
clu = []
clu = []
totN = 0
totN = 0
chrom = ln.split()[0]
chrom = ln.split()[0]
for ex in ln.split()[1:]:
for ex in ln.split()[1:]:
A, B, N = ex.split(":")
A, B, N = ex.split(":")
clu.append(((int(A),int(B)), int(N)))
clu.append(((int(A),int(B)), int(N)))
totN += int(N)
totN += int(N)
Copier
Copié
Copier
Copié
if totN < minclureads: continue
if totN < minclureads: continue
Copier
Copié
Copier
Copié
if options.const:
if len(clu) == 1:
buf = '%s ' % chrom
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
fout.write(buf+'\n')
for cl in refine_linked(clu):
for cl in refine_linked(clu):
rc = refine_cluster(cl,minratio, minreads)
rc = refine_cluster(cl,minratio, minreads)
Copier
Copié
Copier
Copié
if len(rc) > 0:
if len(rc) > 0:
for clu in rc:
for clu in rc:
buf = '%s ' % chrom
buf = '%s ' % chrom
for interval, count in clu:
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
Ncl += 1
fout.write(buf+'\n')
fout.write(buf+'\n')
Copier
Copié
Copier
Copié
sys.stderr.write("Split into %s clusters...\n"%Ncl)
sys.stderr.write("Split into %s clusters...\n"%Ncl)
fout.close()
fout.close()
def merge_junctions(options):
def merge_junctions(options):
''' function to merge junctions '''
''' function to merge junctions '''
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copier
Copié
Copier
Copié
fnameout = "%s/%s"%(rundir,outPrefix)
fnameout = "%s/%s"%(rundir,outPrefix)
Copier
Copié
Copier
Copié
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
lsts = []
lsts = []
for ln in open(flist):
for ln in open(flist):
lsts.append(ln.strip())
lsts.append(ln.strip())
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
# Change 300 if max open file is < 300
# Change 300 if max open file is < 300
N = min([300, max([100, int(len(lsts)**(0.5))])])
N = min([300, max([100, int(len(lsts)**(0.5))])])
tmpfiles = []
tmpfiles = []
while len(lsts) > 1:
while len(lsts) > 1:
clst = []
clst = []
for i in range(0,(len(lsts)/N)+1):
for i in range(0,(len(lsts)/N)+1):
lst = lsts[N*i:N*(i+1)]
lst = lsts[N*i:N*(i+1)]
if len(lst) > 0:
if len(lst) > 0:
clst.append(lst)
clst.append(lst)
lsts = []
lsts = []
for lst in clst:
for lst in clst:
if len(lst) == 0: continue
if len(lst) == 0: continue
tmpfile = tempfile.mktemp()
tmpfile = tempfile.mktemp()
os.mkdir(tmpfile)
os.mkdir(tmpfile)
foutname = tmpfile+"/tmpmerge.gz"
foutname = tmpfile+"/tmpmerge.gz"
fout = gzip.open(foutname,'w')
fout = gzip.open(foutname,'w')
merge_files(lst, fout, options)
merge_files(lst, fout, options)
lsts.append(foutname)
lsts.append(foutname)
tmpfiles.append(foutname)
tmpfiles.append(foutname)
fout.close()
fout.close()
Copier
Copié
Copier
Copié
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
if not options.const:
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
else:
shutil.move(lsts[0], fnameout+"_perind.constcounts.gz")
def merge_files(fnames, fout, options):
def merge_files(fnames, fout, options):
fopen = []
fopen = []
for fname in fnames:
for fname in fnames:
if fname[-3:] == ".gz":
if fname[-3:] == ".gz":
fopen.append(gzip.open(fname))
fopen.append(gzip.open(fname))
else:
else:
fopen.append(open(fname))
fopen.append(open(fname))
finished = False
finished = False
N = 0
N = 0
while not finished:
while not finished:
N += 1
N += 1
if N % 50000 == 0:
if N % 50000 == 0:
sys.stderr.write(".")
sys.stderr.write(".")
buf = []
buf = []
for f in fopen:
for f in fopen:
ln = f.readline().split()
ln = f.readline().split()
if len(ln) == 0:
if len(ln) == 0:
finished = True
finished = True
break
break
chrom = ln[0]
chrom = ln[0]
data = ln[1:]
data = ln[1:]
if len(buf) == 0:
if len(buf) == 0:
buf.append(chrom)
buf.append(chrom)
buf += data
buf += data
if len(buf) > 0:
if len(buf) > 0:
if buf[0] == "chrom":
if buf[0] == "chrom":
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d files"%(len(buf)-1))
sys.stderr.write("merging %d files"%(len(buf)-1))
fout.write(" ".join(buf)+'\n')
fout.write(" ".join(buf)+'\n')
else:
else:
break
break
sys.stderr.write(" done.\n")
sys.stderr.write(" done.\n")
for fin in fopen:
for fin in fopen:
fin.close()
fin.close()
def cluster_intervals(E):
def cluster_intervals(E):
''' Clusters intervals together. '''
''' Clusters intervals together. '''
E.sort()
E.sort()
Copier
Copié
Copier
Copié
if len(E) == 0:
return [], []
current = E[0]
current = E[0]
Eclusters, cluster = [], []
Eclusters, cluster = [], []
i = 0
i = 0
while i < len(E):
while i < len(E):
if overlaps(E[i], current):
if overlaps(E[i], current):
cluster.append(E[i])
cluster.append(E[i])
else:
else:
Eclusters.append(cluster)
Eclusters.append(cluster)
cluster = [E[i]]
cluster = [E[i]]
current = (E[i][0], max([current[1], E[i][1]]))
current = (E[i][0], max([current[1], E[i][1]]))
i += 1
i += 1
if len(cluster) > 0:
if len(cluster) > 0:
Eclusters.append(cluster)
Eclusters.append(cluster)
return Eclusters, E
return Eclusters, E
def overlaps(A,B):
def overlaps(A,B):
'''
'''
Checks if A and B overlaps
Checks if A and B overlaps
'''
'''
if A[1] < B[0] or B[1] < A[0]:
if A[1] < B[0] or B[1] < A[0]:
return False
return False
else: return True
else: return True
def refine_linked(clusters):
def refine_linked(clusters):
unassigned = [x for x in clusters[1:]]
unassigned = [x for x in clusters[1:]]
current = [clusters[0]]
current = [clusters[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
newClusters = []
newClusters = []
while len(unassigned) > 0:
while len(unassigned) > 0:
finished = False
finished = False
while not finished:
while not finished:
finished = True
finished = True
torm = []
torm = []
for intron in unassigned:
for intron in unassigned:
inter, count = intron
inter, count = intron
start, end = inter
start, end = inter
if start in splicesites or end in splicesites:
if start in splicesites or end in splicesites:
current.append(intron)
current.append(intron)
splicesites.add(start)
splicesites.add(start)
splicesites.add(end)
splicesites.add(end)
finished = False
finished = False
torm.append(intron)
torm.append(intron)
for intron in torm:
for intron in torm:
unassigned.remove(intron)
unassigned.remove(intron)
newClusters.append(current)
newClusters.append(current)
current = []
current = []
if len(unassigned) > 0:
if len(unassigned) > 0:
current = [unassigned[0]]
current = [unassigned[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
unassigned = unassigned[1:]
unassigned = unassigned[1:]
return newClusters
return newClusters
def refine_cluster(clu, cutoff, readcutoff):
def refine_cluster(clu, cutoff, readcutoff):
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
remove and recluster '''
remove and recluster '''
remove = []
remove = []
dic = {}
dic = {}
intervals = []
intervals = []
reCLU = False
reCLU = False
totN = 0
totN = 0
for inter, count in clu:
for inter, count in clu:
totN += count
totN += count
for inter, count in clu:
for inter, count in clu:
if (count/float(totN) >= cutoff and count >= readcutoff):
if (count/float(totN) >= cutoff and count >= readcutoff):
intervals.append(inter)
intervals.append(inter)
dic[inter] = count
dic[inter] = count
else:
else:
reCLU = True
reCLU = True
if len(intervals) == 0: return []
if len(intervals) == 0: return []
# This makes sure that after trimming, the clusters are still good
# This makes sure that after trimming, the clusters are still good
Atmp, B = cluster_intervals(intervals)
Atmp, B = cluster_intervals(intervals)
A = []
A = []
for cl in Atmp:
for cl in Atmp:
for c in refine_linked([(x,0) for x in cl]):
for c in refine_linked([(x,0) for x in cl]):
if len(c) > 0:
if len(c) > 0:
A.append([x[0] for x in c])
A.append([x[0] for x in c])
if len(A) == 1:
if len(A) == 1:
rc = [(x, dic[x]) for x in A[0]]
rc = [(x, dic[x]) for x in A[0]]
if len(rc) > 1:
if len(rc) > 1:
if reCLU:
if reCLU:
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
else:
else:
return [[(x, dic[x]) for x in A[0]]]
return [[(x, dic[x]) for x in A[0]]]
else:
else:
return []
return []
NCs = []
NCs = []
for c in A:
for c in A:
if len(c) > 1:
if len(c) > 1:
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NCs += NC
NCs += NC
Copier
Copié
Copier
Copié
return NCs
return NCs
def get_numers(options):
def get_numers(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copier
Copié
Copier
Copié
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
if not options.const:
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
else:
fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix)
input_file=gzip.open(fname, 'rb')
input_file=gzip.open(fname, 'rb')
fout = gzip.open(fnameout,'w')
fout = gzip.open(fnameout,'w')
first_line=True
first_line=True
for l in input_file:
for l in input_file:
if first_line:
if first_line:
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
first_line=False
first_line=False
else:
else:
l=l.strip()
l=l.strip()
words=l.split(" ")
words=l.split(" ")
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
input_file.close()
input_file.close()
fout.close()
fout.close()
if __name__ == "__main__":
if __name__ == "__main__":
from optparse import OptionParser
from optparse import OptionParser
parser = OptionParser()
parser = OptionParser()
parser.add_option("-j", "--juncfiles", dest="juncfiles",
parser.add_option("-j", "--juncfiles", dest="juncfiles",
help="text file with all junction files to be processed")
help="text file with all junction files to be processed")
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
help="output prefix (default leafcutter)")
help="output prefix (default leafcutter)")
parser.add_option("-q", "--quiet",
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
help="don't print status messages to stdout")
parser.add_option("-r", "--rundir", dest="rundir", default='./',
parser.add_option("-r", "--rundir", dest="rundir", default='./',
help="write to directory (default ./)")
help="write to directory (default ./)")
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
help="maximum intron length in bp (default 100,000bp)")
help="maximum intron length in bp (default 100,000bp)")
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
help="minimum reads in a cluster (default 30 reads)")
help="minimum reads in a cluster (default 30 reads)")
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
Copier
Copié
Copier
Copié
help="minimum reads
for an intron
(default 5 reads)")
help="minimum reads
in a cluster
(default 5 reads)")
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
Copier
Copié
Copier
Copié
parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False,
help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)")
Copier
Copié
Copier
Copié
parser.add_option("-
s
", "--
strand
", dest="
strand",
default = False,
parser.add_option("-
c
", "--
cluster
", dest="
cluster", default = None,
help="
use strand info (
default
=
False
)
")
help="refined cluster file when clusters are already made")
parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",
default = False,
help="
check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...")
parser.add_option("-C", "--includeconst", dest="const", action="store_true",
default
=
False
,
help="also include constitutive introns
")
Copier
Copié
Copier
Copié
parser.add_option("-s", "--strand", dest="strand", default = True,
help="use strand info (default=True)")
(options, args) = parser.parse_args()
(options, args) = parser.parse_args()
if options.juncfiles == None:
if options.juncfiles == None:
sys.stderr.write("Error: no junction file provided...\n")
sys.stderr.write("Error: no junction file provided...\n")
exit(0)
exit(0)
# Get the junction file list
# Get the junction file list
libl = []
libl = []
for junc in open(options.juncfiles):
for junc in open(options.juncfiles):
junc = junc.strip()
junc = junc.strip()
try:
try:
open(junc)
open(junc)
except:
except:
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
exit(0)
exit(0)
libl.append(junc)
libl.append(junc)
main(options, libl)
main(options, libl)
Différences enregistrées
Texte d'origine
Ouvrir un fichier
#!/usr/bin/env python import sys import tempfile import os import gzip import shutil def main(options,libl): pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand = lnsplit if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue A, B = int(A), int(B)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom, strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom, strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 1: # if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir refined_cluster = "%s/%s_refined"%(rundir,outPrefix) checkchrom = options.checkchrom useStrand = options.strand runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] chrom = tuple(chrom.split(":")) cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, start, end, dot, count, strand = ln.split() if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue chrom = (chrom, strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (int(start), int(end)+1) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() shutil.move(lsts[0], fnameout+"_perind.counts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() if len(E) == 0: return [], [] current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads for an intron (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)") parser.add_option("-s", "--strand", dest="strand", default = False, help="use strand info (default=False)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
Texte modifié
Ouvrir un fichier
# https://github.com/griffithlab/regtools # /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc # Using regtools speeds up the junction extraction step by an order of magnitude or more import sys import tempfile import os import gzip import shutil def main(options,libl): if options.cluster == None: pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom,strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 0: # 1 if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir checkchrom = options.checkchrom useStrand = options.strand if options.cluster == None: refined_cluster = "%s/%s_refined"%(rundir,outPrefix) else: refined_cluster = options.cluster runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 chrom = (chrom,strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (A, B) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon chrom = tuple(chrom.split(":")) start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chrom = tuple(chrom.split(":")) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue if options.const: if len(clu) == 1: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() if not options.const: shutil.move(lsts[0], fnameout+"_perind.counts.gz") else: shutil.move(lsts[0], fnameout+"_perind.constcounts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir if not options.const: fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) else: fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads in a cluster (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-c", "--cluster", dest="cluster", default = None, help="refined cluster file when clusters are already made") parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...") parser.add_option("-C", "--includeconst", dest="const", action="store_true",default = False, help="also include constitutive introns") parser.add_option("-s", "--strand", dest="strand", default = True, help="use strand info (default=True)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
Trouver la différence