Diff
checker
Texto
Texto
Imagens
Documentos
Excel
Pastas
Legal
Enterprise
Aplicativo para desktop
Preços
Fazer login
Baixar o Diffchecker Desktop
Comparar texto
Encontre a diferença entre dois arquivos de texto
Ferramentas
Histórico
Editor live
Recolher inalteradas
Sem quebra de linha
Layout
Dividido
Unificado
Nível de detalhe
Inteligente
Palavra
Caractere
Realce de sintaxe
Escolher sintaxe
Ignorar
Transformar texto
Ir à primeira mudança
Editar entrada
Diffchecker Desktop
A maneira mais segura de usar o Diffchecker. Obtenha o aplicativo Diffchecker Desktop: seus diffs nunca saem do seu computador!
Obter Desktop
leafcutter_cluster.py VS leafcutter_cluster_regtools.py
Criado
há 6 anos
O diff nunca expira
Limpar
Exportar
Compartilhar
Explicar
36 remoções
Linhas
Total
Removido
Caracteres
Total
Removido
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
489 linhas
Copiar tudo
94 adições
Linhas
Total
Adicionado
Caracteres
Total
Adicionado
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
534 linhas
Copiar tudo
Copiar
Copiado
Copiar
Copiado
#!/usr/bin/env python
# https://github.com/griffithlab/regtools
# /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc
# Using regtools speeds up the junction extraction step by an order of magnitude or more
import sys
import sys
import tempfile
import tempfile
import os
import os
import gzip
import gzip
import shutil
import shutil
def main(options,libl):
def main(options,libl):
Copiar
Copiado
Copiar
Copiado
pool_junc_reads(libl, options)
if options.cluster == None:
refine_clusters(options)
pool_junc_reads(libl, options)
refine_clusters(options)
sort_junctions(libl, options)
sort_junctions(libl, options)
merge_junctions(options)
merge_junctions(options)
get_numers(options)
get_numers(options)
def pool_junc_reads(flist, options):
def pool_junc_reads(flist, options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
maxIntronLen = int(options.maxintronlen)
maxIntronLen = int(options.maxintronlen)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
outFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_pooled"%(rundir,outPrefix)
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
by_chrom = {}
by_chrom = {}
for libl in flist:
for libl in flist:
lib = libl.strip()
lib = libl.strip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
if options.verbose:
if options.verbose:
sys.stderr.write("scanning %s...\n"%lib)
sys.stderr.write("scanning %s...\n"%lib)
Copiar
Copiado
Copiar
Copiado
if lib[-3:] == ".gz": F = gzip.open(lib)
for ln in
open(lib)
:
else: F = open(lib)
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
Copiar
Copiado
Copiar
Copiado
chrom, A, B, dot, counts, strand
= lnsplit
chrom, A, B, dot, counts, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= lnsplit
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
Copiar
Copiado
Copiar
Copiado
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
Copiar
Copiado
Copiar
Copiado
A, B = int(A)
, int(B)
+1
Aoff, Boff = blockSize.split(",")
A, B = int(A)
+int(Aoff)
, int(B)
-int(Boff)+1
if B-A > int(maxIntronLen): continue
if B-A > int(maxIntronLen): continue
Copiar
Copiado
Copiar
Copiado
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,
strand)][(A,B)]
except:
except:
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
try: by_chrom[(chrom,strand)][(A,B)] = int(counts)
Copiar
Copiado
Copiar
Copiado
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
except: by_chrom[(chrom,
strand)] = {(A,B):int(counts)}
fout = file(outFile, 'w')
fout = file(outFile, 'w')
Ncluster = 0
Ncluster = 0
sys.stderr.write("Parsing...\n")
sys.stderr.write("Parsing...\n")
for chrom in by_chrom:
for chrom in by_chrom:
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads
read_ks.sort()
read_ks.sort()
Copiar
Copiado
Copiar
Copiado
sys.stderr.write("%s:%s.."%chrom)
sys.stderr.write("%s:%s.."%chrom)
clu = cluster_intervals(read_ks)[0]
clu = cluster_intervals(read_ks)[0]
for cl in clu:
for cl in clu:
Copiar
Copiado
Copiar
Copiado
if len(cl) >
1
: #
if cluster has more than one intron
if len(cl) >
0
: #
1
if cluster has more than one intron
buf = '%s:%s '%chrom
buf = '%s:%s '%chrom
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
for interval, count in [(x, by_chrom[chrom][x]) for x in cl]:
buf += "%d:%d" % interval + ":%d"%count+ " "
buf += "%d:%d" % interval + ":%d"%count+ " "
fout.write(buf+'\n')
fout.write(buf+'\n')
Copiar
Copiado
Copiar
Copiado
Ncluster += 1
Ncluster += 1
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
sys.stderr.write("\nWrote %d clusters..\n"%Ncluster)
fout.close()
fout.close()
def sort_junctions(libl, options):
def sort_junctions(libl, options):
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y']
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copiar
Copiado
Copiar
Copiado
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
checkchrom = options.checkchrom
checkchrom = options.checkchrom
useStrand = options.strand
useStrand = options.strand
Copiar
Copiado
Copiar
Copiado
if options.cluster == None:
refined_cluster = "%s/%s_refined"%(rundir,outPrefix)
else:
refined_cluster = options.cluster
runName = "%s/%s"%(rundir, outPrefix)
runName = "%s/%s"%(rundir, outPrefix)
exons, cluExons = {}, {}
exons, cluExons = {}, {}
cluN = 0
cluN = 0
for ln in open(refined_cluster):
for ln in open(refined_cluster):
chrom = ln.split()[0]
chrom = ln.split()[0]
Copiar
Copiado
Copiar
Copiado
chrom = tuple(chrom.split(":"))
cluN += 1
cluN += 1
for exon in ln.split()[1:]:
for exon in ln.split()[1:]:
A, B, count = exon.split(":")
A, B, count = exon.split(":")
if chrom not in exons:
if chrom not in exons:
exons[chrom] = {}
exons[chrom] = {}
Copiar
Copiado
Copiar
Copiado
exons[chrom][(int(A),int(B))] = cluN
exons[chrom][(int(A),int(B))] = cluN
if cluN not in cluExons:
if cluN not in cluExons:
cluExons[cluN] = []
cluExons[cluN] = []
cluExons[cluN].append((chrom, A, B))
cluExons[cluN].append((chrom, A, B))
merges = {}
merges = {}
for ll in libl:
for ll in libl:
lib=ll.rstrip()
lib=ll.rstrip()
if not os.path.isfile(lib):
if not os.path.isfile(lib):
continue
continue
libN = lib
libN = lib
if libN not in merges:
if libN not in merges:
merges[libN] = []
merges[libN] = []
merges[libN].append(lib)
merges[libN].append(lib)
fout_runlibs = file(runName+"_sortedlibs",'w')
fout_runlibs = file(runName+"_sortedlibs",'w')
for libN in merges:
for libN in merges:
libName = "%s/%s"%(rundir,libN.split('/')[-1])
libName = "%s/%s"%(rundir,libN.split('/')[-1])
by_chrom = {}
by_chrom = {}
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1])
fout_runlibs.write(foutName+'\n')
fout_runlibs.write(foutName+'\n')
if options.verbose:
if options.verbose:
sys.stderr.write("Sorting %s..\n"%libN)
sys.stderr.write("Sorting %s..\n"%libN)
if len(merges[libN]) > 1:
if len(merges[libN]) > 1:
if options.verbose:
if options.verbose:
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
sys.stderr.write("merging %s...\n"%(" ".join(merges[libN])))
else:
else:
pass
pass
fout = gzip.open(foutName,'w')
fout = gzip.open(foutName,'w')
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0])
for lib in merges[libN]:
for lib in merges[libN]:
Copiar
Copiado
Copiar
Copiado
if lib[-3:] == ".gz": F = gzip.open(lib)
else: F = open(lib)
for ln in
open(lib)
:
for ln in
F
:
lnsplit=ln.split()
lnsplit=ln.split()
if len(lnsplit)<6:
if len(lnsplit)<6:
sys.stderr.write("Error in %s \n" % lib)
sys.stderr.write("Error in %s \n" % lib)
continue
continue
Copiar
Copiado
Copiar
Copiado
chrom,
start, end
, dot, count, strand
= ln
.
split
()
chrom,
A, B
, dot, count, strand
, rA,rb, rgb, blockCount, blockSize, blockStarts
= ln
split
if int(blockCount) > 2:
print ln
continue
if not useStrand:
if not useStrand:
Copiar
Copiado
Copiar
Copiado
strand = "NA"
strand = "NA"
if checkchrom and (chrom not in chromLst): continue
if checkchrom and (chrom not in chromLst): continue
Copiar
Copiado
Copiar
Copiado
chrom = (chrom,
strand)
Aoff, Boff = blockSize.split(",")
A, B = int(A)+int(Aoff), int(B)-int(Boff)+1
chrom = (chrom,
strand)
if chrom not in by_chrom:
if chrom not in by_chrom:
by_chrom[chrom] = {}
by_chrom[chrom] = {}
Copiar
Copiado
Copiar
Copiado
intron = (
int(start), int(end)+1)
intron = (
A, B)
if intron in by_chrom[chrom]:
if intron in by_chrom[chrom]:
by_chrom[chrom][intron] += int(count)
by_chrom[chrom][intron] += int(count)
else:
else:
by_chrom[chrom][intron] = int(count)
by_chrom[chrom][intron] = int(count)
Copiar
Copiado
Copiar
Copiado
for clu in cluExons:
for clu in cluExons:
buf = []
buf = []
ks = cluExons[clu]
ks = cluExons[clu]
ks.sort()
ks.sort()
tot = 0
tot = 0
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
Copiar
Copiado
Copiar
Copiado
chrom = tuple(chrom.split(":"))
start, end = int(start), int(end)
start, end = int(start), int(end)
if chrom not in by_chrom:
if chrom not in by_chrom:
pass
pass
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
tot += by_chrom[chrom][(start,end)]
tot += by_chrom[chrom][(start,end)]
for exon in ks:
for exon in ks:
chrom, start, end = exon
chrom, start, end = exon
start, end = int(start), int(end)
start, end = int(start), int(end)
Copiar
Copiado
Copiar
Copiado
chrom = tuple(chrom.split(":"))
chromID, strand = chrom
chromID, strand = chrom
if chrom not in by_chrom:
if chrom not in by_chrom:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot))
Copiar
Copiado
Copiar
Copiado
elif (start,end) in by_chrom[chrom]:
elif (start,end) in by_chrom[chrom]:
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot))
else:
else:
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot))
fout.write("".join(buf))
fout.write("".join(buf))
fout.close()
fout.close()
fout_runlibs.close()
fout_runlibs.close()
def refine_clusters(options):
def refine_clusters(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
minratio = float(options.mincluratio)
minratio = float(options.mincluratio)
minclureads = int(options.minclureads)
minclureads = int(options.minclureads)
minreads = int(options.minreads)
minreads = int(options.minreads)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
inFile = "%s/%s_pooled"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
outFile = "%s/%s_refined"%(rundir,outPrefix)
Copiar
Copiado
Copiar
Copiado
fout = file(outFile,'w')
fout = file(outFile,'w')
Copiar
Copiado
Copiar
Copiado
Ncl = 0
Ncl = 0
for ln in open(inFile):
for ln in open(inFile):
clu = []
clu = []
totN = 0
totN = 0
chrom = ln.split()[0]
chrom = ln.split()[0]
for ex in ln.split()[1:]:
for ex in ln.split()[1:]:
A, B, N = ex.split(":")
A, B, N = ex.split(":")
clu.append(((int(A),int(B)), int(N)))
clu.append(((int(A),int(B)), int(N)))
totN += int(N)
totN += int(N)
Copiar
Copiado
Copiar
Copiado
if totN < minclureads: continue
if totN < minclureads: continue
Copiar
Copiado
Copiar
Copiado
if options.const:
if len(clu) == 1:
buf = '%s ' % chrom
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
fout.write(buf+'\n')
for cl in refine_linked(clu):
for cl in refine_linked(clu):
rc = refine_cluster(cl,minratio, minreads)
rc = refine_cluster(cl,minratio, minreads)
Copiar
Copiado
Copiar
Copiado
if len(rc) > 0:
if len(rc) > 0:
for clu in rc:
for clu in rc:
buf = '%s ' % chrom
buf = '%s ' % chrom
for interval, count in clu:
for interval, count in clu:
buf += "%d:%d" % interval + ":%d"%(count)+ " "
buf += "%d:%d" % interval + ":%d"%(count)+ " "
Ncl += 1
Ncl += 1
fout.write(buf+'\n')
fout.write(buf+'\n')
Copiar
Copiado
Copiar
Copiado
sys.stderr.write("Split into %s clusters...\n"%Ncl)
sys.stderr.write("Split into %s clusters...\n"%Ncl)
fout.close()
fout.close()
def merge_junctions(options):
def merge_junctions(options):
''' function to merge junctions '''
''' function to merge junctions '''
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copiar
Copiado
Copiar
Copiado
fnameout = "%s/%s"%(rundir,outPrefix)
fnameout = "%s/%s"%(rundir,outPrefix)
Copiar
Copiado
Copiar
Copiado
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
flist = "%s/%s_sortedlibs"%(rundir, outPrefix)
lsts = []
lsts = []
for ln in open(flist):
for ln in open(flist):
lsts.append(ln.strip())
lsts.append(ln.strip())
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
sys.stderr.write("merging %d junction files...\n"%(len(lsts)))
# Change 300 if max open file is < 300
# Change 300 if max open file is < 300
N = min([300, max([100, int(len(lsts)**(0.5))])])
N = min([300, max([100, int(len(lsts)**(0.5))])])
tmpfiles = []
tmpfiles = []
while len(lsts) > 1:
while len(lsts) > 1:
clst = []
clst = []
for i in range(0,(len(lsts)/N)+1):
for i in range(0,(len(lsts)/N)+1):
lst = lsts[N*i:N*(i+1)]
lst = lsts[N*i:N*(i+1)]
if len(lst) > 0:
if len(lst) > 0:
clst.append(lst)
clst.append(lst)
lsts = []
lsts = []
for lst in clst:
for lst in clst:
if len(lst) == 0: continue
if len(lst) == 0: continue
tmpfile = tempfile.mktemp()
tmpfile = tempfile.mktemp()
os.mkdir(tmpfile)
os.mkdir(tmpfile)
foutname = tmpfile+"/tmpmerge.gz"
foutname = tmpfile+"/tmpmerge.gz"
fout = gzip.open(foutname,'w')
fout = gzip.open(foutname,'w')
merge_files(lst, fout, options)
merge_files(lst, fout, options)
lsts.append(foutname)
lsts.append(foutname)
tmpfiles.append(foutname)
tmpfiles.append(foutname)
fout.close()
fout.close()
Copiar
Copiado
Copiar
Copiado
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
if not options.const:
shutil.move(lsts[0], fnameout+"_perind.counts.gz")
else:
shutil.move(lsts[0], fnameout+"_perind.constcounts.gz")
def merge_files(fnames, fout, options):
def merge_files(fnames, fout, options):
fopen = []
fopen = []
for fname in fnames:
for fname in fnames:
if fname[-3:] == ".gz":
if fname[-3:] == ".gz":
fopen.append(gzip.open(fname))
fopen.append(gzip.open(fname))
else:
else:
fopen.append(open(fname))
fopen.append(open(fname))
finished = False
finished = False
N = 0
N = 0
while not finished:
while not finished:
N += 1
N += 1
if N % 50000 == 0:
if N % 50000 == 0:
sys.stderr.write(".")
sys.stderr.write(".")
buf = []
buf = []
for f in fopen:
for f in fopen:
ln = f.readline().split()
ln = f.readline().split()
if len(ln) == 0:
if len(ln) == 0:
finished = True
finished = True
break
break
chrom = ln[0]
chrom = ln[0]
data = ln[1:]
data = ln[1:]
if len(buf) == 0:
if len(buf) == 0:
buf.append(chrom)
buf.append(chrom)
buf += data
buf += data
if len(buf) > 0:
if len(buf) > 0:
if buf[0] == "chrom":
if buf[0] == "chrom":
if options.verbose:
if options.verbose:
sys.stderr.write("merging %d files"%(len(buf)-1))
sys.stderr.write("merging %d files"%(len(buf)-1))
fout.write(" ".join(buf)+'\n')
fout.write(" ".join(buf)+'\n')
else:
else:
break
break
sys.stderr.write(" done.\n")
sys.stderr.write(" done.\n")
for fin in fopen:
for fin in fopen:
fin.close()
fin.close()
def cluster_intervals(E):
def cluster_intervals(E):
''' Clusters intervals together. '''
''' Clusters intervals together. '''
E.sort()
E.sort()
Copiar
Copiado
Copiar
Copiado
if len(E) == 0:
return [], []
current = E[0]
current = E[0]
Eclusters, cluster = [], []
Eclusters, cluster = [], []
i = 0
i = 0
while i < len(E):
while i < len(E):
if overlaps(E[i], current):
if overlaps(E[i], current):
cluster.append(E[i])
cluster.append(E[i])
else:
else:
Eclusters.append(cluster)
Eclusters.append(cluster)
cluster = [E[i]]
cluster = [E[i]]
current = (E[i][0], max([current[1], E[i][1]]))
current = (E[i][0], max([current[1], E[i][1]]))
i += 1
i += 1
if len(cluster) > 0:
if len(cluster) > 0:
Eclusters.append(cluster)
Eclusters.append(cluster)
return Eclusters, E
return Eclusters, E
def overlaps(A,B):
def overlaps(A,B):
'''
'''
Checks if A and B overlaps
Checks if A and B overlaps
'''
'''
if A[1] < B[0] or B[1] < A[0]:
if A[1] < B[0] or B[1] < A[0]:
return False
return False
else: return True
else: return True
def refine_linked(clusters):
def refine_linked(clusters):
unassigned = [x for x in clusters[1:]]
unassigned = [x for x in clusters[1:]]
current = [clusters[0]]
current = [clusters[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
newClusters = []
newClusters = []
while len(unassigned) > 0:
while len(unassigned) > 0:
finished = False
finished = False
while not finished:
while not finished:
finished = True
finished = True
torm = []
torm = []
for intron in unassigned:
for intron in unassigned:
inter, count = intron
inter, count = intron
start, end = inter
start, end = inter
if start in splicesites or end in splicesites:
if start in splicesites or end in splicesites:
current.append(intron)
current.append(intron)
splicesites.add(start)
splicesites.add(start)
splicesites.add(end)
splicesites.add(end)
finished = False
finished = False
torm.append(intron)
torm.append(intron)
for intron in torm:
for intron in torm:
unassigned.remove(intron)
unassigned.remove(intron)
newClusters.append(current)
newClusters.append(current)
current = []
current = []
if len(unassigned) > 0:
if len(unassigned) > 0:
current = [unassigned[0]]
current = [unassigned[0]]
splicesites = set([current[0][0][0],current[0][0][1]])
splicesites = set([current[0][0][0],current[0][0][1]])
unassigned = unassigned[1:]
unassigned = unassigned[1:]
return newClusters
return newClusters
def refine_cluster(clu, cutoff, readcutoff):
def refine_cluster(clu, cutoff, readcutoff):
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff,
remove and recluster '''
remove and recluster '''
remove = []
remove = []
dic = {}
dic = {}
intervals = []
intervals = []
reCLU = False
reCLU = False
totN = 0
totN = 0
for inter, count in clu:
for inter, count in clu:
totN += count
totN += count
for inter, count in clu:
for inter, count in clu:
if (count/float(totN) >= cutoff and count >= readcutoff):
if (count/float(totN) >= cutoff and count >= readcutoff):
intervals.append(inter)
intervals.append(inter)
dic[inter] = count
dic[inter] = count
else:
else:
reCLU = True
reCLU = True
if len(intervals) == 0: return []
if len(intervals) == 0: return []
# This makes sure that after trimming, the clusters are still good
# This makes sure that after trimming, the clusters are still good
Atmp, B = cluster_intervals(intervals)
Atmp, B = cluster_intervals(intervals)
A = []
A = []
for cl in Atmp:
for cl in Atmp:
for c in refine_linked([(x,0) for x in cl]):
for c in refine_linked([(x,0) for x in cl]):
if len(c) > 0:
if len(c) > 0:
A.append([x[0] for x in c])
A.append([x[0] for x in c])
if len(A) == 1:
if len(A) == 1:
rc = [(x, dic[x]) for x in A[0]]
rc = [(x, dic[x]) for x in A[0]]
if len(rc) > 1:
if len(rc) > 1:
if reCLU:
if reCLU:
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff)
else:
else:
return [[(x, dic[x]) for x in A[0]]]
return [[(x, dic[x]) for x in A[0]]]
else:
else:
return []
return []
NCs = []
NCs = []
for c in A:
for c in A:
if len(c) > 1:
if len(c) > 1:
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff)
NCs += NC
NCs += NC
Copiar
Copiado
Copiar
Copiado
return NCs
return NCs
def get_numers(options):
def get_numers(options):
outPrefix = options.outprefix
outPrefix = options.outprefix
rundir = options.rundir
rundir = options.rundir
Copiar
Copiado
Copiar
Copiado
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
if not options.const:
fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix)
else:
fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix)
fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix)
input_file=gzip.open(fname, 'rb')
input_file=gzip.open(fname, 'rb')
fout = gzip.open(fnameout,'w')
fout = gzip.open(fnameout,'w')
first_line=True
first_line=True
for l in input_file:
for l in input_file:
if first_line:
if first_line:
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names
first_line=False
first_line=False
else:
else:
l=l.strip()
l=l.strip()
words=l.split(" ")
words=l.split(" ")
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n')
input_file.close()
input_file.close()
fout.close()
fout.close()
if __name__ == "__main__":
if __name__ == "__main__":
from optparse import OptionParser
from optparse import OptionParser
parser = OptionParser()
parser = OptionParser()
parser.add_option("-j", "--juncfiles", dest="juncfiles",
parser.add_option("-j", "--juncfiles", dest="juncfiles",
help="text file with all junction files to be processed")
help="text file with all junction files to be processed")
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter',
help="output prefix (default leafcutter)")
help="output prefix (default leafcutter)")
parser.add_option("-q", "--quiet",
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
help="don't print status messages to stdout")
parser.add_option("-r", "--rundir", dest="rundir", default='./',
parser.add_option("-r", "--rundir", dest="rundir", default='./',
help="write to directory (default ./)")
help="write to directory (default ./)")
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000,
help="maximum intron length in bp (default 100,000bp)")
help="maximum intron length in bp (default 100,000bp)")
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
parser.add_option("-m", "--minclureads", dest="minclureads", default = 30,
help="minimum reads in a cluster (default 30 reads)")
help="minimum reads in a cluster (default 30 reads)")
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
parser.add_option("-M", "--minreads", dest="minreads", default = 5,
Copiar
Copiado
Copiar
Copiado
help="minimum reads
for an intron
(default 5 reads)")
help="minimum reads
in a cluster
(default 5 reads)")
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001,
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
help="minimum fraction of reads in a cluster that support a junction (default 0.001)")
Copiar
Copiado
Copiar
Copiado
parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False,
help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)")
Copiar
Copiado
Copiar
Copiado
parser.add_option("-
s
", "--
strand
", dest="
strand",
default = False,
parser.add_option("-
c
", "--
cluster
", dest="
cluster", default = None,
help="
use strand info (
default
=
False
)
")
help="refined cluster file when clusters are already made")
parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",
default = False,
help="
check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...")
parser.add_option("-C", "--includeconst", dest="const", action="store_true",
default
=
False
,
help="also include constitutive introns
")
Copiar
Copiado
Copiar
Copiado
parser.add_option("-s", "--strand", dest="strand", default = True,
help="use strand info (default=True)")
(options, args) = parser.parse_args()
(options, args) = parser.parse_args()
if options.juncfiles == None:
if options.juncfiles == None:
sys.stderr.write("Error: no junction file provided...\n")
sys.stderr.write("Error: no junction file provided...\n")
exit(0)
exit(0)
# Get the junction file list
# Get the junction file list
libl = []
libl = []
for junc in open(options.juncfiles):
for junc in open(options.juncfiles):
junc = junc.strip()
junc = junc.strip()
try:
try:
open(junc)
open(junc)
except:
except:
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
sys.stderr.write("%s does not exist... check your junction files.\n"%junc)
exit(0)
exit(0)
libl.append(junc)
libl.append(junc)
main(options, libl)
main(options, libl)
Diferenças salvas
Texto original
Abrir arquivo
#!/usr/bin/env python import sys import tempfile import os import gzip import shutil def main(options,libl): pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand = lnsplit if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue A, B = int(A), int(B)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom, strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom, strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 1: # if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir refined_cluster = "%s/%s_refined"%(rundir,outPrefix) checkchrom = options.checkchrom useStrand = options.strand runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] chrom = tuple(chrom.split(":")) cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: if lib[-3:] == ".gz": F = gzip.open(lib) else: F = open(lib) for ln in F: lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, start, end, dot, count, strand = ln.split() if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue chrom = (chrom, strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (int(start), int(end)+1) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() shutil.move(lsts[0], fnameout+"_perind.counts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() if len(E) == 0: return [], [] current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads for an intron (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-k", "--checkchrom", dest="checkchrom", default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ... (default=False)") parser.add_option("-s", "--strand", dest="strand", default = False, help="use strand info (default=False)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
Texto alterado
Abrir arquivo
# https://github.com/griffithlab/regtools # /home/yangili1/tools/regtools/build/regtools junctions extract -a 8 -i 50 -I 500000 bamfile.bam -o outfile.junc # Using regtools speeds up the junction extraction step by an order of magnitude or more import sys import tempfile import os import gzip import shutil def main(options,libl): if options.cluster == None: pool_junc_reads(libl, options) refine_clusters(options) sort_junctions(libl, options) merge_junctions(options) get_numers(options) def pool_junc_reads(flist, options): outPrefix = options.outprefix rundir = options.rundir maxIntronLen = int(options.maxintronlen) checkchrom = options.checkchrom useStrand = options.strand outFile = "%s/%s_pooled"%(rundir,outPrefix) chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] by_chrom = {} for libl in flist: lib = libl.strip() if not os.path.isfile(lib): continue if options.verbose: sys.stderr.write("scanning %s...\n"%lib) for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, counts, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 if B-A > int(maxIntronLen): continue try: by_chrom[(chrom,strand)][(A,B)] = int(counts) + by_chrom[(chrom,strand)][(A,B)] except: try: by_chrom[(chrom,strand)][(A,B)] = int(counts) except: by_chrom[(chrom,strand)] = {(A,B):int(counts)} fout = file(outFile, 'w') Ncluster = 0 sys.stderr.write("Parsing...\n") for chrom in by_chrom: read_ks = [k for k,v in by_chrom[chrom].items() if v >= 3] # a junction must have at least 3 reads read_ks.sort() sys.stderr.write("%s:%s.."%chrom) clu = cluster_intervals(read_ks)[0] for cl in clu: if len(cl) > 0: # 1 if cluster has more than one intron buf = '%s:%s '%chrom for interval, count in [(x, by_chrom[chrom][x]) for x in cl]: buf += "%d:%d" % interval + ":%d"%count+ " " fout.write(buf+'\n') Ncluster += 1 sys.stderr.write("\nWrote %d clusters..\n"%Ncluster) fout.close() def sort_junctions(libl, options): chromLst = ["chr%d"%x for x in range(1,23)]+['chrX','chrY']+["%d"%x for x in range(1,23)]+['X','Y'] outPrefix = options.outprefix rundir = options.rundir checkchrom = options.checkchrom useStrand = options.strand if options.cluster == None: refined_cluster = "%s/%s_refined"%(rundir,outPrefix) else: refined_cluster = options.cluster runName = "%s/%s"%(rundir, outPrefix) exons, cluExons = {}, {} cluN = 0 for ln in open(refined_cluster): chrom = ln.split()[0] cluN += 1 for exon in ln.split()[1:]: A, B, count = exon.split(":") if chrom not in exons: exons[chrom] = {} exons[chrom][(int(A),int(B))] = cluN if cluN not in cluExons: cluExons[cluN] = [] cluExons[cluN].append((chrom, A, B)) merges = {} for ll in libl: lib=ll.rstrip() if not os.path.isfile(lib): continue libN = lib if libN not in merges: merges[libN] = [] merges[libN].append(lib) fout_runlibs = file(runName+"_sortedlibs",'w') for libN in merges: libName = "%s/%s"%(rundir,libN.split('/')[-1]) by_chrom = {} foutName = libName+'.%s.sorted.gz'%(runName.split("/")[-1]) fout_runlibs.write(foutName+'\n') if options.verbose: sys.stderr.write("Sorting %s..\n"%libN) if len(merges[libN]) > 1: if options.verbose: sys.stderr.write("merging %s...\n"%(" ".join(merges[libN]))) else: pass fout = gzip.open(foutName,'w') fout.write("chrom %s\n"%libN.split("/")[-1].split(".junc")[0]) for lib in merges[libN]: for ln in open(lib): lnsplit=ln.split() if len(lnsplit)<6: sys.stderr.write("Error in %s \n" % lib) continue chrom, A, B, dot, count, strand, rA,rb, rgb, blockCount, blockSize, blockStarts = lnsplit if int(blockCount) > 2: print ln continue if not useStrand: strand = "NA" if checkchrom and (chrom not in chromLst): continue Aoff, Boff = blockSize.split(",") A, B = int(A)+int(Aoff), int(B)-int(Boff)+1 chrom = (chrom,strand) if chrom not in by_chrom: by_chrom[chrom] = {} intron = (A, B) if intron in by_chrom[chrom]: by_chrom[chrom][intron] += int(count) else: by_chrom[chrom][intron] = int(count) for clu in cluExons: buf = [] ks = cluExons[clu] ks.sort() tot = 0 for exon in ks: chrom, start, end = exon chrom = tuple(chrom.split(":")) start, end = int(start), int(end) if chrom not in by_chrom: pass elif (start,end) in by_chrom[chrom]: tot += by_chrom[chrom][(start,end)] for exon in ks: chrom, start, end = exon start, end = int(start), int(end) chrom = tuple(chrom.split(":")) chromID, strand = chrom if chrom not in by_chrom: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu, strand, tot)) elif (start,end) in by_chrom[chrom]: buf.append("%s:%d:%d:clu_%d_%s %d/%d\n"%(chromID,start, end, clu,strand, by_chrom[chrom][(start,end)], tot)) else: buf.append("%s:%d:%d:clu_%d_%s 0/%d\n"%(chromID,start, end,clu,strand, tot)) fout.write("".join(buf)) fout.close() fout_runlibs.close() def refine_clusters(options): outPrefix = options.outprefix rundir = options.rundir minratio = float(options.mincluratio) minclureads = int(options.minclureads) minreads = int(options.minreads) inFile = "%s/%s_pooled"%(rundir,outPrefix) outFile = "%s/%s_refined"%(rundir,outPrefix) fout = file(outFile,'w') Ncl = 0 for ln in open(inFile): clu = [] totN = 0 chrom = ln.split()[0] for ex in ln.split()[1:]: A, B, N = ex.split(":") clu.append(((int(A),int(B)), int(N))) totN += int(N) if totN < minclureads: continue if options.const: if len(clu) == 1: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') for cl in refine_linked(clu): rc = refine_cluster(cl,minratio, minreads) if len(rc) > 0: for clu in rc: buf = '%s ' % chrom for interval, count in clu: buf += "%d:%d" % interval + ":%d"%(count)+ " " Ncl += 1 fout.write(buf+'\n') sys.stderr.write("Split into %s clusters...\n"%Ncl) fout.close() def merge_junctions(options): ''' function to merge junctions ''' outPrefix = options.outprefix rundir = options.rundir fnameout = "%s/%s"%(rundir,outPrefix) flist = "%s/%s_sortedlibs"%(rundir, outPrefix) lsts = [] for ln in open(flist): lsts.append(ln.strip()) if options.verbose: sys.stderr.write("merging %d junction files...\n"%(len(lsts))) # Change 300 if max open file is < 300 N = min([300, max([100, int(len(lsts)**(0.5))])]) tmpfiles = [] while len(lsts) > 1: clst = [] for i in range(0,(len(lsts)/N)+1): lst = lsts[N*i:N*(i+1)] if len(lst) > 0: clst.append(lst) lsts = [] for lst in clst: if len(lst) == 0: continue tmpfile = tempfile.mktemp() os.mkdir(tmpfile) foutname = tmpfile+"/tmpmerge.gz" fout = gzip.open(foutname,'w') merge_files(lst, fout, options) lsts.append(foutname) tmpfiles.append(foutname) fout.close() if not options.const: shutil.move(lsts[0], fnameout+"_perind.counts.gz") else: shutil.move(lsts[0], fnameout+"_perind.constcounts.gz") def merge_files(fnames, fout, options): fopen = [] for fname in fnames: if fname[-3:] == ".gz": fopen.append(gzip.open(fname)) else: fopen.append(open(fname)) finished = False N = 0 while not finished: N += 1 if N % 50000 == 0: sys.stderr.write(".") buf = [] for f in fopen: ln = f.readline().split() if len(ln) == 0: finished = True break chrom = ln[0] data = ln[1:] if len(buf) == 0: buf.append(chrom) buf += data if len(buf) > 0: if buf[0] == "chrom": if options.verbose: sys.stderr.write("merging %d files"%(len(buf)-1)) fout.write(" ".join(buf)+'\n') else: break sys.stderr.write(" done.\n") for fin in fopen: fin.close() def cluster_intervals(E): ''' Clusters intervals together. ''' E.sort() current = E[0] Eclusters, cluster = [], [] i = 0 while i < len(E): if overlaps(E[i], current): cluster.append(E[i]) else: Eclusters.append(cluster) cluster = [E[i]] current = (E[i][0], max([current[1], E[i][1]])) i += 1 if len(cluster) > 0: Eclusters.append(cluster) return Eclusters, E def overlaps(A,B): ''' Checks if A and B overlaps ''' if A[1] < B[0] or B[1] < A[0]: return False else: return True def refine_linked(clusters): unassigned = [x for x in clusters[1:]] current = [clusters[0]] splicesites = set([current[0][0][0],current[0][0][1]]) newClusters = [] while len(unassigned) > 0: finished = False while not finished: finished = True torm = [] for intron in unassigned: inter, count = intron start, end = inter if start in splicesites or end in splicesites: current.append(intron) splicesites.add(start) splicesites.add(end) finished = False torm.append(intron) for intron in torm: unassigned.remove(intron) newClusters.append(current) current = [] if len(unassigned) > 0: current = [unassigned[0]] splicesites = set([current[0][0][0],current[0][0][1]]) unassigned = unassigned[1:] return newClusters def refine_cluster(clu, cutoff, readcutoff): ''' for each exon in the cluster compute the ratio of reads, if smaller than cutoff, remove and recluster ''' remove = [] dic = {} intervals = [] reCLU = False totN = 0 for inter, count in clu: totN += count for inter, count in clu: if (count/float(totN) >= cutoff and count >= readcutoff): intervals.append(inter) dic[inter] = count else: reCLU = True if len(intervals) == 0: return [] # This makes sure that after trimming, the clusters are still good Atmp, B = cluster_intervals(intervals) A = [] for cl in Atmp: for c in refine_linked([(x,0) for x in cl]): if len(c) > 0: A.append([x[0] for x in c]) if len(A) == 1: rc = [(x, dic[x]) for x in A[0]] if len(rc) > 1: if reCLU: return refine_cluster([(x, dic[x]) for x in A[0]], cutoff, readcutoff) else: return [[(x, dic[x]) for x in A[0]]] else: return [] NCs = [] for c in A: if len(c) > 1: NC = refine_cluster([(x, dic[x]) for x in c], cutoff, readcutoff) NCs += NC return NCs def get_numers(options): outPrefix = options.outprefix rundir = options.rundir if not options.const: fname = "%s/%s_perind.counts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.counts.gz"%(rundir,outPrefix) else: fname = "%s/%s_perind.constcounts.gz"%(rundir,outPrefix) fnameout = "%s/%s_perind_numers.constcounts.gz"%(rundir,outPrefix) input_file=gzip.open(fname, 'rb') fout = gzip.open(fnameout,'w') first_line=True for l in input_file: if first_line: fout.write(" ".join(l.strip().split(" ")[1:])+'\n') # print the sample names first_line=False else: l=l.strip() words=l.split(" ") fout.write(words[0]+ " "+ " ".join( [ g.split("/")[0] for g in words[1:] ] ) +'\n') input_file.close() fout.close() if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-j", "--juncfiles", dest="juncfiles", help="text file with all junction files to be processed") parser.add_option("-o", "--outprefix", dest="outprefix", default = 'leafcutter', help="output prefix (default leafcutter)") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") parser.add_option("-r", "--rundir", dest="rundir", default='./', help="write to directory (default ./)") parser.add_option("-l", "--maxintronlen", dest="maxintronlen", default = 100000, help="maximum intron length in bp (default 100,000bp)") parser.add_option("-m", "--minclureads", dest="minclureads", default = 30, help="minimum reads in a cluster (default 30 reads)") parser.add_option("-M", "--minreads", dest="minreads", default = 5, help="minimum reads in a cluster (default 5 reads)") parser.add_option("-p", "--mincluratio", dest="mincluratio", default = 0.001, help="minimum fraction of reads in a cluster that support a junction (default 0.001)") parser.add_option("-c", "--cluster", dest="cluster", default = None, help="refined cluster file when clusters are already made") parser.add_option("-k", "--checkchrom", dest="checkchrom", action="store_true",default = False, help="check that the chromosomes are well formated e.g. chr1, chr2, ..., or 1, 2, ...") parser.add_option("-C", "--includeconst", dest="const", action="store_true",default = False, help="also include constitutive introns") parser.add_option("-s", "--strand", dest="strand", default = True, help="use strand info (default=True)") (options, args) = parser.parse_args() if options.juncfiles == None: sys.stderr.write("Error: no junction file provided...\n") exit(0) # Get the junction file list libl = [] for junc in open(options.juncfiles): junc = junc.strip() try: open(junc) except: sys.stderr.write("%s does not exist... check your junction files.\n"%junc) exit(0) libl.append(junc) main(options, libl)
Encontrar Diferença