Diff
checker
Texto
Texto
Imágenes
Documentos
Excel
Carpetas
Legal
Enterprise
Aplicación de escritorio
Precios
Iniciar sesión
Descargar Diffchecker Desktop
Comparar texto
Encuentra la diferencia entre dos archivos de texto
Herramientas
Historial
Editor live
Ocultar sin cambios
Sin ajuste de línea
Vista
Dividido
Unificado
Nivel de detalle
Inteligente
Palabra
Letra
Resaltado de sintaxis
Elegir sintaxis
Ignorar
Transformar texto
Ir al primer cambio
Editar entrada
Diffchecker Desktop
La forma más segura de usar Diffchecker. ¡Obtén la app de Diffchecker Desktop: tus diffs nunca salen de tu computadora!
Obtener Desktop
Untitled diff
Creado
hace 11 años
El diff nunca expira
Borrar
Exportar
Compartir
Explicar
31 eliminaciones
Líneas
Total
Eliminado
Caracteres
Total
Eliminado
Para continuar usando esta función, actualice a
Diff
checker
Pro
Ver precios
606 líneas
Copiar todo
30 adiciones
Líneas
Total
Añadido
Caracteres
Total
Añadido
Para continuar usando esta función, actualice a
Diff
checker
Pro
Ver precios
604 líneas
Copiar todo
#LOH
#LOH
probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt'
probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt'
Copiar
Copiado
Copiar
Copiado
##SampleScoreFile = r'C:\Users\ly091\Documents\result8\
Loss
\tumor.
Loss
.Score.txt'
##SampleScoreFile = r'C:\Users\ly091\Documents\result8\
Gain
\tumor.
Gain
.Score.txt'
##ControlScoreFile = r'C:\Users\ly091\Documents\result8\
Loss
\Germline.
Loss
.Score.txt'
##ControlScoreFile = r'C:\Users\ly091\Documents\result8\
Gain
\Germline.
Gain
.Score.txt'
GScoreThreshold=0.01#minimum Gscore to be recorded
GScoreThreshold=0.01#minimum Gscore to be recorded
Recordstoread=10000000#for debug
Recordstoread=10000000#for debug
EchoPerLines=1000#verbose
EchoPerLines=1000#verbose
GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair
GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair
MinimumPeakWidth=2000#minimum Peak Width, unit base pair
MinimumPeakWidth=2000#minimum Peak Width, unit base pair
pvaluelimit=0.01
pvaluelimit=0.01
#Global variables
#Global variables
probes=list()#from the probe file, will be filled and sorted
probes=list()#from the probe file, will be filled and sorted
FirstMarkersonChromosomes=dict()#First markers of chromosomes
FirstMarkersonChromosomes=dict()#First markers of chromosomes
LastMarkersonChromosomes=dict() #Last markers of chromosomes
LastMarkersonChromosomes=dict() #Last markers of chromosomes
#Functions
#Functions
def mcnemar(a,b,c,d):
def mcnemar(a,b,c,d):
"""
"""
Input args:
Input args:
a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable
a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable
Output:
Output:
pvalue of test.
pvalue of test.
"""
"""
if b+c>0:
if b+c>0:
chi2testval = (abs(b-c) - 1)**2/ (b + c)
chi2testval = (abs(b-c) - 1)**2/ (b + c)
df = 1
df = 1
pvalue = 1 - stats.chi2.cdf(chi2testval, df)
pvalue = 1 - stats.chi2.cdf(chi2testval, df)
else:
else:
pvalue=1.0
pvalue=1.0
return pvalue
return pvalue
def AbeforeB(a,b):
def AbeforeB(a,b):
#compare postions of two probes, a and b contain: [chromosome, base pair position]
#compare postions of two probes, a and b contain: [chromosome, base pair position]
if a[0]<b[0]:
if a[0]<b[0]:
return True
return True
elif a[0]>b[0]:
elif a[0]>b[0]:
return False
return False
else: #case: in the same chromosome
else: #case: in the same chromosome
if a[1]<b[1]:
if a[1]<b[1]:
return True
return True
elif a[1]>b[1]:
elif a[1]>b[1]:
return False
return False
else:
else:
return True #same position, return True here
return True #same position, return True here
def LocateProbe(probe):#probe has: chromosome number, start size
def LocateProbe(probe):#probe has: chromosome number, start size
p1=0
p1=0
p2=len(probes)-1
p2=len(probes)-1
while True:
while True:
pcenter=int((p1+p2)/2)
pcenter=int((p1+p2)/2)
#print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter))
#print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter))
if probes[pcenter]==probe:
if probes[pcenter]==probe:
return pcenter
return pcenter
elif probes[p1]==probe:
elif probes[p1]==probe:
return p1
return p1
elif probes[p2]==probe:
elif probes[p2]==probe:
return p2
return p2
elif AbeforeB(probes[pcenter],probe):#take the right half
elif AbeforeB(probes[pcenter],probe):#take the right half
if p1!=pcenter:
if p1!=pcenter:
p1=pcenter
p1=pcenter
else:
else:
print("cannot find probe")
print("cannot find probe")
print(str(probe))
print(str(probe))
exit(577)
exit(577)
else: #take the left half
else: #take the left half
if p2!=pcenter:
if p2!=pcenter:
p2=pcenter
p2=pcenter
else:
else:
print("cannot find probe")
print("cannot find probe")
print(str(probe))
print(str(probe))
exit(577)
exit(577)
def LeftTarget(probe):#probe has: chromosome number, start size
def LeftTarget(probe):#probe has: chromosome number, start size
## print(probe)
## print(probe)
p1=0
p1=0
## print(len(gtarget_numbers))
## print(len(gtarget_numbers))
while p1<len(gtarget_numbers):
while p1<len(gtarget_numbers):
if gtarget_numbers[p1][1]==probe[0]:#same chromosome
if gtarget_numbers[p1][1]==probe[0]:#same chromosome
#print("goes here")
#print("goes here")
if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered
if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered
if p1==0:
if p1==0:
return 0
return 0
else:
else:
return p1-1
return p1-1
p1+=1
p1+=1
if p1==len(gtarget_numbers):
if p1==len(gtarget_numbers):
return -111111111111
return -111111111111
def RightTarget(probe):#probe has: chromosome number, start size
def RightTarget(probe):#probe has: chromosome number, start size
# print(probe)
# print(probe)
p1=1
p1=1
while p1<len(gtarget_numbers):
while p1<len(gtarget_numbers):
## print(gtarget_numbers[-p1])
## print(gtarget_numbers[-p1])
## print(p1)
## print(p1)
## print(probe)
## print(probe)
if gtarget_numbers[-p1][1]==probe[0]:#same chromosome
if gtarget_numbers[-p1][1]==probe[0]:#same chromosome
if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered
if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered
if p1==1:
if p1==1:
return len(gtarget_numbers)-1
return len(gtarget_numbers)-1
else:
else:
return len(gtarget_numbers)-p1+1
return len(gtarget_numbers)-p1+1
p1+=1
p1+=1
if p1==len(gtarget_numbers):
if p1==len(gtarget_numbers):
return p1
return p1
def CountMarkers(nchr,nstart,nend):
def CountMarkers(nchr,nstart,nend):
## print("in countmarkers")
## print("in countmarkers")
## print(nchr)
## print(nchr)
## print(nstart)
## print(nstart)
## print(nend)
## print(nend)
return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1
return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1
def PreviousProbeOf(element):
def PreviousProbeOf(element):
#element is a list with chromosome and probe position, e.g.[2,242953866]
#element is a list with chromosome and probe position, e.g.[2,242953866]
n=LocateProbe(element)-1
n=LocateProbe(element)-1
if n<0:
if n<0:
return 0
return 0
else:
else:
return n
return n
def NextProbeOf(element):
def NextProbeOf(element):
n= LocateProbe(element)+1
n= LocateProbe(element)+1
if n>len(probes)-1:
if n>len(probes)-1:
return len(probes)-1
return len(probes)-1
else:
else:
return n
return n
def RelationshipOf(seg1,seg2):
def RelationshipOf(seg1,seg2):
## function RelationshipOf(seg1 as list, seg2 as list)
## function RelationshipOf(seg1 as list, seg2 as list)
## compare two segments seg1 and seg2,
## compare two segments seg1 and seg2,
## seg1 and seg2 are two lists, each with the Min and Max values of the segment,
## seg1 and seg2 are two lists, each with the Min and Max values of the segment,
## Min should be the first value in the list and Max the second.
## Min should be the first value in the list and Max the second.
## return a string indicating the relationship between the two segments:
## return a string indicating the relationship between the two segments:
## "equal" : identical
## "equal" : identical
## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap
## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap
## "in": seg1 is entirely within seg2
## "in": seg1 is entirely within seg2
## "cover": seg1 is bigger than seg2 and completely contains seg2
## "cover": seg1 is bigger than seg2 and completely contains seg2
## "overlap": seg1 partially overlaps with seg2
## "overlap": seg1 partially overlaps with seg2
if seg1[0]==seg2[0] and seg1[1]==seg2[1]:
if seg1[0]==seg2[0] and seg1[1]==seg2[1]:
return "equal"
return "equal"
elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]:
elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]:
return "in"
return "in"
elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]:
elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]:
return "cover"
return "cover"
if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) :
if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) :
return "overlap"
return "overlap"
else :
else :
return "no relationship"
return "no relationship"
def ZeroFillSpaces(StartProbeIndex,EndProbeIndex):
def ZeroFillSpaces(StartProbeIndex,EndProbeIndex):
#return a list of lists each is a properly formated segment with value of 0.0
#return a list of lists each is a properly formated segment with value of 0.0
#StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted
#StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted
lists_of_zerofills=list()
lists_of_zerofills=list()
#print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex))
#print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex))
if StartProbeIndex< 0 or StartProbeIndex>= len(probes):
if StartProbeIndex< 0 or StartProbeIndex>= len(probes):
print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex))
print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex))
quit(767)
quit(767)
elif EndProbeIndex <0 or EndProbeIndex>=len(probes):
elif EndProbeIndex <0 or EndProbeIndex>=len(probes):
print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex))
print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex))
quit(768)
quit(768)
elif StartProbeIndex>=EndProbeIndex:
elif StartProbeIndex>=EndProbeIndex:
return lists_of_zerofills
return lists_of_zerofills
else:
else:
pass
pass
stChr=probes[StartProbeIndex][0]
stChr=probes[StartProbeIndex][0]
stBase=probes[StartProbeIndex][1]
stBase=probes[StartProbeIndex][1]
endChr=probes[EndProbeIndex][0]
endChr=probes[EndProbeIndex][0]
endBase=probes[EndProbeIndex][1]
endBase=probes[EndProbeIndex][1]
while stChr<=endChr:
while stChr<=endChr:
if stChr==endChr:
if stChr==endChr:
if stBase<endBase:
if stBase<endBase:
n=0
n=0
lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0])
lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0])
else:
else:
break
break
else:
else:
endMark=LastMarkersonChromosomes[stChr]
endMark=LastMarkersonChromosomes[stChr]
lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0])
lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0])
#move on to next chromosome
#move on to next chromosome
stChr+=1
stChr+=1
if stChr>24:
if stChr>24:
break
break
else:
else:
stBase=FirstMarkersonChromosomes[stChr]
stBase=FirstMarkersonChromosomes[stChr]
return lists_of_zerofills
return lists_of_zerofills
def getCytobands(chromosome,startn,endn):
def getCytobands(chromosome,startn,endn):
#note: chromosome is 1-22 and X or Y
#note: chromosome is 1-22 and X or Y
try:
try:
## cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu',
## cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu',
## database='hg19')
## database='hg19')
cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306',
cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306',
database='ucsc_hg18')
database='ucsc_hg18')
except mysql.connector.Error as err:
except mysql.connector.Error as err:
print("Error occured during obtaining cytoband information from local database.")
print("Error occured during obtaining cytoband information from local database.")
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exists")
print("Database does not exists")
else:
else:
print(err)
print(err)
cursor = cnx.cursor()
cursor = cnx.cursor()
query = ("SELECT name"
query = ("SELECT name"
" FROM cytoBand "
" FROM cytoBand "
" WHERE chrom=\"chr"+str(chromosome)+"\" "
" WHERE chrom=\"chr"+str(chromosome)+"\" "
" and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")"
" and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")"
" ORDER BY name")
" ORDER BY name")
#print(query)
#print(query)
cursor.execute(query)
cursor.execute(query)
cytobands=''
cytobands=''
for bandname in cursor:
for bandname in cursor:
cytobands=cytobands+''+bandname
cytobands=cytobands+''+bandname
cursor.close()
cursor.close()
cnx.close()
cnx.close()
return cytobands
return cytobands
Copiar
Copiado
Copiar
Copiado
def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap):
def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap):
#region is a peak, as a list, in order: chromosome, start, end
#region is a peak, as a list, in order: chromosome, start, end
peak=region
peak=region
#first initiate codemap to 0
#first initiate codemap to 0
for x in codemap.keys():
for x in codemap.keys():
codemap[x]=[0,0]
codemap[x]=[0,0]
for Array in SampleSegments:
for Array in SampleSegments:
cumulativelength=0
cumulativelength=0
for row in Array[1]:
for row in Array[1]:
if row[1]==peak[0]:#same chromosome
if row[1]==peak[0]:#same chromosome
segA=[peak[1],peak[2]]
segA=[peak[1],peak[2]]
segB=[row[2],row[3]]
segB=[row[2],row[3]]
re=RelationshipOf(segA,segB)
re=RelationshipOf(segA,segB)
if re=="no relationship":#do nothing "no relationship"
if re=="no relationship":#do nothing "no relationship"
#print("goes here")
#print("goes here")
pass
pass
elif re=="in" or re=="equal":
elif re=="in" or re=="equal":
codemap[arraymap[Array[0]]][0]=1
codemap[arraymap[Array[0]]][0]=1
break #this sample has been counted, so no need to check other segments in this sample
break #this sample has been counted, so no need to check other segments in this sample
elif re=="cover":
elif re=="cover":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
codemap[arraymap[Array[0]]][0]=1
codemap[arraymap[Array[0]]][0]=1
break #this sample has been counted, so no need to check other segments in this sample
break #this sample has been counted, so no need to check other segments in this sample
else:
else:
cumulativelength+=sb
cumulativelength+=sb
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
elif re=="overlap":
elif re=="overlap":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if segA[0]<segB[0]: #left overlap
if segA[0]<segB[0]: #left overlap
sb_overlap=segA[1]-segB[0]
sb_overlap=segA[1]-segB[0]
sb_outside=segB[1]-segA[1]
sb_outside=segB[1]-segA[1]
else: #right overlap
else: #right overlap
sb_overlap=segB[1]-segA[0]
sb_overlap=segB[1]-segA[0]
sb_outside=segA[0]-segA[0]
sb_outside=segA[0]-segA[0]
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
codemap[arraymap[Array[0]]][0]=1
codemap[arraymap[Array[0]]][0]=1
break #this sample has been counted, so no need to check other segments in this sample
break #this sample has been counted, so no need to check other segments in this sample
else:
else:
cumulativelength+=sb_overlap
cumulativelength+=sb_overlap
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
else:
else:
print("Strange relationship: "+re)
print("Strange relationship: "+re)
exit(436)
exit(436)
for Array in ControlSegments:
for Array in ControlSegments:
cumulativelength=0
cumulativelength=0
for row in Array[1]:
for row in Array[1]:
if row[1]==peak[0]:#same chromosome
if row[1]==peak[0]:#same chromosome
segA=[peak[1],peak[2]]
segA=[peak[1],peak[2]]
segB=[row[2],row[3]]
segB=[row[2],row[3]]
re=RelationshipOf(segA,segB)
re=RelationshipOf(segA,segB)
if re=="no relationship":#do nothing
if re=="no relationship":#do nothing
pass
pass
elif re=="in" or re=="equal":
elif re=="in" or re=="equal":
codemap[arraymap[Array[0]]][1]=1
codemap[arraymap[Array[0]]][1]=1
break #this Control has been counted, so no need to check other segments in this Control
break #this Control has been counted, so no need to check other segments in this Control
elif re=="cover":
elif re=="cover":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
codemap[arraymap[Array[0]]][1]=1
codemap[arraymap[Array[0]]][1]=1
break #this Control has been counted, so no need to check other segments in this Control
break #this Control has been counted, so no need to check other segments in this Control
else:
else:
cumulativelength+=sb
cumulativelength+=sb
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
elif re=="overlap":
elif re=="overlap":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if segA[0]<segB[0]: #left overlap
if segA[0]<segB[0]: #left overlap
sb_overlap=segA[1]-segB[0]
sb_overlap=segA[1]-segB[0]
sb_outside=segB[1]-segA[1]
sb_outside=segB[1]-segA[1]
else: #right overlap
else: #right overlap
sb_overlap=segB[1]-segA[0]
sb_overlap=segB[1]-segA[0]
sb_outside=segA[0]-segA[0]
sb_outside=segA[0]-segA[0]
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
codemap[arraymap[Array[0]]][1]=1
codemap[arraymap[Array[0]]][1]=1
break #this Control has been counted, so no need to check other segments in this Control
break #this Control has been counted, so no need to check other segments in this Control
else:
else:
cumulativelength+=sb_overlap
cumulativelength+=sb_overlap
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
else:
else:
print("Strange relationship: "+re)
print("Strange relationship: "+re)
exit(436)
exit(436)
#calculate a,b,c,d
#calculate a,b,c,d
a=b=c=d=0
a=b=c=d=0
for x in codemap.keys():
for x in codemap.keys():
if codemap[x][0]==1 and codemap[x][1]==1:# + +
if codemap[x][0]==1 and codemap[x][1]==1:# + +
a+=1
a+=1
elif codemap[x][0]==1 and codemap[x][1]==0:# + -
elif codemap[x][0]==1 and codemap[x][1]==0:# + -
b+=1
b+=1
elif codemap[x][0]==0 and codemap[x][1]==1:# + -
elif codemap[x][0]==0 and codemap[x][1]==1:# + -
c+=1
c+=1
elif codemap[x][0]==0 and codemap[x][1]==0:# + -
elif codemap[x][0]==0 and codemap[x][1]==0:# + -
d+=1
d+=1
return [a,b,c,d]
return [a,b,c,d]
Copiar
Copiado
Copiar
Copiado
# main execution starts here
# main execution starts here
import os
import os
import os.path
import os.path
import csv
import csv
import shutil
import shutil
import time
import time
import operator
import operator
import copy
import copy
import scipy
import scipy
import math
import math
from scipy import stats
from scipy import stats
import mysql.connector
import mysql.connector
from mysql.connector import errorcode
from mysql.connector import errorcode
startclock=time.clock()
startclock=time.clock()
print ("\n"*5)
print ("\n"*5)
print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
# read probes
# read probes
print("Now read probes.")
print("Now read probes.")
ifile = open(probefile, "r")
ifile = open(probefile, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
for row in reader:
for row in reader:
try:
try:
n=0
n=0
if row[1]=='X':
if row[1]=='X':
#print("probes on X found")
#print("probes on X found")
n=23
n=23
elif row[1]=='Y':
elif row[1]=='Y':
n=24
n=24
elif row[1]=='MT':#ignore MT
elif row[1]=='MT':#ignore MT
continue
continue
else:
else:
n=int(row[1])
n=int(row[1])
#note: start size position has been added by 1 as below!!
#note: start size position has been added by 1 as below!!
probes.append([n,int(row[2])])
probes.append([n,int(row[2])])
except:
except:
print(row)
print(row)
quit(963)
quit(963)
ifile.close()
ifile.close()
probes=sorted(probes,key=operator.itemgetter(0,1))
probes=sorted(probes,key=operator.itemgetter(0,1))
#initiate FirstMarkers and LastMarkers on chromosomes
#initiate FirstMarkers and LastMarkers on chromosomes
##FirstMarkersonChromosomes=dict()#First markers of chromosomes
##FirstMarkersonChromosomes=dict()#First markers of chromosomes
##LastMarkersonChromosomes=dict() #Last markers of chromosomes
##LastMarkersonChromosomes=dict() #Last markers of chromosomes
n=0
n=0
for index,p in enumerate(probes):
for index,p in enumerate(probes):
if p[0]!=n: #first occation of a chromosome
if p[0]!=n: #first occation of a chromosome
FirstMarkersonChromosomes[p[0]]=(p[1])
FirstMarkersonChromosomes[p[0]]=(p[1])
if n!=0: #not just start
if n!=0: #not just start
LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1]
LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1]
n=p[0]
n=p[0]
LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list
LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list
SegmentsDict=dict()
SegmentsDict=dict()
ScoresDict=dict()
ScoresDict=dict()
sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt'
sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt'
for loopcontrol in range(1,4):
for loopcontrol in range(1,4):
if loopcontrol ==1:
if loopcontrol ==1:
#FamilialTumor-FamilialGermline
#FamilialTumor-FamilialGermline
SampleSize99=ControlSize99=42
SampleSize99=ControlSize99=42
Copiar
Copiado
Copiar
Copiado
Samples = r'C:\Users\ly091\Documents\result8\
Loss
\FamilialTumor.
Loss
.centroless.seg'
Samples = r'C:\Users\ly091\Documents\result8\
Gain
\FamilialTumor.
Gain
.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\
Loss
\FamilialGermline.
Loss
.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\
Gain
\FamilialGermline.
Gain
.centroless.seg'
outputfile=r'C:\Users\ly091\Documents\result8\
Loss
\FamilialTumor-FamiliarGermline.
Loss
.mc.gistic'
outputfile=r'C:\Users\ly091\Documents\result8\
Gain
\FamilialTumor-FamiliarGermline.
Gain
.mc.gistic'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\
Loss
\FamilialTumor-FamiliarGermline.
Loss
.mc.statistics.txt'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\
Gain
\FamilialTumor-FamiliarGermline.
Gain
.mc.statistics.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt'
elif loopcontrol ==2:
elif loopcontrol ==2:
#SporadicTumor - SporadicGermline
#SporadicTumor - SporadicGermline
SampleSize99=ControlSize99=91
SampleSize99=ControlSize99=91
Copiar
Copiado
Copiar
Copiado
Samples = r'C:\Users\ly091\Documents\result8\
Loss
\sporadicTumor.
Loss
.centroless.seg'
Samples = r'C:\Users\ly091\Documents\result8\
Gain
\sporadicTumor.
Gain
.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\
Loss
\sporadicGermline.
Loss
.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\
Gain
\sporadicGermline.
Gain
.centroless.seg'
outputfile=r'C:\Users\ly091\Documents\result8\
Loss
\SporadicTumor-SporadicGermline.
Loss
.mc.gistic'
outputfile=r'C:\Users\ly091\Documents\result8\
Gain
\SporadicTumor-SporadicGermline.
Gain
.mc.gistic'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\
Loss
\SporadicTumor-SporadicGermline.
Loss
.mc.statistics.txt'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\
Gain
\SporadicTumor-SporadicGermline.
Gain
.mc.statistics.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt'
elif loopcontrol ==3:
elif loopcontrol ==3:
#Tumor-Germline
#Tumor-Germline
SampleSize99=ControlSize99=133
SampleSize99=ControlSize99=133
Copiar
Copiado
Copiar
Copiado
Samples = r'C:\Users\ly091\Documents\result8\
Loss
\tumor.
Loss
.centroless.seg'
Samples = r'C:\Users\ly091\Documents\result8\
Gain
\tumor.
Gain
.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\
Loss
\germline.
Loss
.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\
Gain
\germline.
Gain
.centroless.seg'
outputfile=r'C:\Users\ly091\Documents\result8\
Loss
\Tumor-Germline.
Loss
.mc.gistic'
outputfile=r'C:\Users\ly091\Documents\result8\
Gain
\Tumor-Germline.
Gain
.mc.gistic'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\
Loss
\Tumor-Germline.
Loss
.mc.statistics.txt'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\
Gain
\Tumor-Germline.
Gain
.mc.statistics.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt'
else:
else:
print("Error in control.")
print("Error in control.")
exit(444)
exit(444)
print("Now build the code map and the array map...")
print("Now build the code map and the array map...")
#build the code map
#build the code map
codemap=dict()#to store the sample information file
codemap=dict()#to store the sample information file
ifile = open(patientcodes, "r")
ifile = open(patientcodes, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
for row in reader:
for row in reader:
if len(row)==0:
if len(row)==0:
pass
pass
else:
else:
codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region
codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region
#0 is no, 1 is yes
#0 is no, 1 is yes
ifile.close()
ifile.close()
#build the array map
#build the array map
arraymap=dict()
arraymap=dict()
ifile = open(sampleinformationfile, "r")
ifile = open(sampleinformationfile, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
rownum = 0
rownum = 0
for row in reader:
for row in reader:
if rownum == 0:
if rownum == 0:
rownum+=1
rownum+=1
if row[17]!='Sample2':
if row[17]!='Sample2':
print("File error!")
print("File error!")
quit(883)
quit(883)
else:
else:
sample2=row[17]
sample2=row[17]
if sample2 in codemap:
if sample2 in codemap:
arraymap[row[0].split('.')[0]]=sample2
arraymap[row[0].split('.')[0]]=sample2
else:
else:
arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used.
arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used.
ifile.close()
ifile.close()
if Samples in SegmentsDict:
if Samples in SegmentsDict:
print("\n*** Use saved results for "+Samples +" ***\n")
print("\n*** Use saved results for "+Samples +" ***\n")
SampleSegments=copy.deepcopy(SegmentsDict[Samples])
SampleSegments=copy.deepcopy(SegmentsDict[Samples])
SampleScores=copy.deepcopy(ScoresDict[Samples])
SampleScores=copy.deepcopy(ScoresDict[Samples])
else:
else:
#read Samples:
#read Samples:
print("Now read Samples.")
print("Now read Samples.")
SampleSegments=list()
SampleSegments=list()
allsegments_unsorted=list()
allsegments_unsorted=list()
ifile = open(Samples, "r")
ifile = open(Samples, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
rownum = 0
rownum = 0
ndebug=0
ndebug=0
for row in reader:
for row in reader:
if rownum == 0:
if rownum == 0:
rownum+=1
rownum+=1
else:
else:
chromosome=0
chromosome=0
if row[1]=='X':
if row[1]=='X':
chromosome=23
chromosome=23
elif row[1]=='Y':
elif row[1]=='Y':
chromosome=24
chromosome=24
else:
else:
chromosome=int(row[1])
chromosome=int(row[1])
# if chromosome==4:
# if chromosome==4:
if True:
if True:
if ndebug>Recordstoread:
if ndebug>Recordstoread:
print("Warning: Sample file is not read to the end.")
print("Warning: Sample file is not read to the end.")
break
break
if arraymap[row[0]]!='unused':
if arraymap[row[0]]!='unused':
ndebug+=1
ndebug+=1
allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])])
allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])])
ifile.close()
ifile.close()
allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3))
allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3))
array=''
array=''
for row in allsegments:
for row in allsegments:
if row[0]!=array:
if row[0]!=array:
array=row[0]
array=row[0]
newarray=[array,[row]]
newarray=[array,[row]]
SampleSegments.append(newarray)#order: array, [array, chromosome, start, end]
SampleSegments.append(newarray)#order: array, [array, chromosome, start, end]
else:
else:
SampleSegments[-1][1].append(row)
SampleSegments[-1][1].append(row)
SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted
SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted
cp=0
cp=0
SliceA=[]
SliceA=[]
SliceB=[]
SliceB=[]
pt=list() #list of pointers to the currently working segment in SampleSegments,
pt=list() #list of pointers to the currently working segment in SampleSegments,
#Calculate G-scores for Samples
#Calculate G-scores for Samples
SampleScores=list() #chromosome, start, end, p-value, G-score
SampleScores=list() #chromosome, start, end, p-value, G-score
#remember this is for LOH
#remember this is for LOH
print("Now Calculate G-Scores for Samples.")
print("Now Calculate G-Scores for Samples.")
#print(len(SampleSegments))
#print(len(SampleSegments))
for row in SampleSegments:
for row in SampleSegments:
pt.append(0)#initiate the pointers to 0
pt.append(0)#initiate the pointers to 0
#Find the slice
#Find the slice
while True:
while True:
#pre-set SliceA
#pre-set SliceA
#print("main while")
#print("main while")
empty=True
empty=True
for index, row in enumerate(SampleSegments):
for index, row in enumerate(SampleSegments):
if pt[index]<len(row[1]):
if pt[index]<len(row[1]):
## print(row[1][pt[index]])
## print(row[1][pt[index]])
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
if lengthofsegment<2000000:#ignore very long segments:
if lengthofsegment<2000000:#ignore very long segments:
SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array
SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array
empty=False
empty=False
break
break
#print(empty)
#print(empty)
if empty:
if empty:
break
break
## print("\nBefore refinement,SliceA"+str(SliceA))
## print("\nBefore refinement,SliceA"+str(SliceA))
#refine SliceA
#refine SliceA
for index,row in enumerate(SampleSegments):
for index,row in enumerate(SampleSegments):
if pt[index]<len(row[1]):
if pt[index]<len(row[1]):
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
## print(lengthofsegment)
## print(lengthofsegment)
if lengthofsegment>=2000000:#ignore very long segments
if lengthofsegment>=2000000:#ignore very long segments
# pt[index]+=1
# pt[index]+=1
continue
continue
SliceB=row[1][pt[index]][:]
SliceB=row[1][pt[index]][:]
## print("\nIn refinement,SliceA:"+str(SliceA))
## print("\nIn refinement,SliceA:"+str(SliceA))
## print("In refinement, SliceB:"+str(SliceB))
## print("In refinement, SliceB:"+str(SliceB))
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
pass
pass
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
SliceA=SliceB[:]
SliceA=SliceB[:]
pass
pass
else: #same chromosome
else: #same chromosome
Chrom=SliceA[1]
Chrom=SliceA[1]
segA=SliceA[2:4]
segA=SliceA[2:4]
segB=SliceB[2:4]
segB=SliceB[2:4]
re=RelationshipOf(segA,segB)
re=RelationshipOf(segA,segB)
if re=='equal':
if re=='equal':
pass
pass
elif re=='no relationship':
elif re=='no relationship':
if segA[0]<segB[0]: #A is before B
if segA[0]<segB[0]: #A is before B
pass
pass
else: #A is after B
else: #A is after B
SliceA=SliceB[:]
SliceA=SliceB[:]
elif re=='cover':
elif re=='cover':
if SliceA[2]<SliceB[2]:
if SliceA[2]<SliceB[2]:
SliceA[3]=SliceB[2]
SliceA[3]=SliceB[2]
elif SliceA[2]==SliceB[2]:
elif SliceA[2]==SliceB[2]:
SliceA=SliceB[:]
SliceA=SliceB[:]
else:
else:
print("This is impossible!")
print("This is impossible!")
exit(388)
exit(388)
elif re=='in':
elif re=='in':
if SliceA[2]>SliceB[2]:
if SliceA[2]>SliceB[2]:
SliceA[2]=segB[0]
SliceA[2]=segB[0]
SliceA[3]=segA[0]
SliceA[3]=segA[0]
elif SliceA[2]==SliceB[2]:
elif SliceA[2]==SliceB[2]:
pass
pass
else:
else:
print("This is impossible!")
print("This is impossible!")
exit(554)
exit(554)
elif re=='overlap':
elif re=='overlap':
if SliceA[2]<SliceB[2]: #A before B
if SliceA[2]<SliceB[2]: #A before B
SliceA[3]=SliceB[2]
SliceA[3]=SliceB[2]
else: #A after B
else: #A after B
SliceA[3]=SliceA[2]
SliceA[3]=SliceA[2]
SliceA[2]=SliceB[2]
SliceA[2]=SliceB[2]
else:
else:
print(re)
print(re)
print("This is impossible!")
print("This is impossible!")
exit(323)
exit(323)
#use SliceA to go through all segments
#use SliceA to go through all segments
if cp%EchoPerLines==0:
if cp%EchoPerLines==0:
## if True:
## if True:
print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA))
print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA))
cp+=1
cp+=1
SumSample=0
SumSample=0
for index, row in enumerate(SampleSegments):
for index, row in enumerate(SampleSegments):
if pt[index]<len(row[1]):
if pt[index]<len(row[1]):
SliceB=row[1][pt[index]]#segment to be tested
SliceB=row[1][pt[index]]#segment to be tested
if SliceB[2]==SliceB[3]:#remove 0 length segments from the que.
if SliceB[2]==SliceB[3]:#remove 0 length segments from the que.
pt[index]+=1
pt[index]+=1
continue
continue
## print("\nSliceA: "+str(SliceA))
## print("\nSliceA: "+str(SliceA))
## print("SliceB: "+str(SliceB))
## print("SliceB: "+str(SliceB))
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
## print("A is before B in different chromosomes.")
## print("A is before B in different chromosomes.")
continue
continue
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
pt[index]+=1
pt[index]+=1
continue
continue
else: #same chromosome
else: #same chromosome
Copiar
Copiado
Copiar
Copiado
Chrom=Slice
Chrom=Slice
A[1]
Diferencias guardadas
Texto original
Abrir archivo
#LOH probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt' ##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.Score.txt' ##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Loss\Germline.Loss.Score.txt' GScoreThreshold=0.01#minimum Gscore to be recorded Recordstoread=10000000#for debug EchoPerLines=1000#verbose GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair MinimumPeakWidth=2000#minimum Peak Width, unit base pair pvaluelimit=0.01 #Global variables probes=list()#from the probe file, will be filled and sorted FirstMarkersonChromosomes=dict()#First markers of chromosomes LastMarkersonChromosomes=dict() #Last markers of chromosomes #Functions def mcnemar(a,b,c,d): """ Input args: a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable Output: pvalue of test. """ if b+c>0: chi2testval = (abs(b-c) - 1)**2/ (b + c) df = 1 pvalue = 1 - stats.chi2.cdf(chi2testval, df) else: pvalue=1.0 return pvalue def AbeforeB(a,b): #compare postions of two probes, a and b contain: [chromosome, base pair position] if a[0]<b[0]: return True elif a[0]>b[0]: return False else: #case: in the same chromosome if a[1]<b[1]: return True elif a[1]>b[1]: return False else: return True #same position, return True here def LocateProbe(probe):#probe has: chromosome number, start size p1=0 p2=len(probes)-1 while True: pcenter=int((p1+p2)/2) #print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter)) if probes[pcenter]==probe: return pcenter elif probes[p1]==probe: return p1 elif probes[p2]==probe: return p2 elif AbeforeB(probes[pcenter],probe):#take the right half if p1!=pcenter: p1=pcenter else: print("cannot find probe") print(str(probe)) exit(577) else: #take the left half if p2!=pcenter: p2=pcenter else: print("cannot find probe") print(str(probe)) exit(577) def LeftTarget(probe):#probe has: chromosome number, start size ## print(probe) p1=0 ## print(len(gtarget_numbers)) while p1<len(gtarget_numbers): if gtarget_numbers[p1][1]==probe[0]:#same chromosome #print("goes here") if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered if p1==0: return 0 else: return p1-1 p1+=1 if p1==len(gtarget_numbers): return -111111111111 def RightTarget(probe):#probe has: chromosome number, start size # print(probe) p1=1 while p1<len(gtarget_numbers): ## print(gtarget_numbers[-p1]) ## print(p1) ## print(probe) if gtarget_numbers[-p1][1]==probe[0]:#same chromosome if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered if p1==1: return len(gtarget_numbers)-1 else: return len(gtarget_numbers)-p1+1 p1+=1 if p1==len(gtarget_numbers): return p1 def CountMarkers(nchr,nstart,nend): ## print("in countmarkers") ## print(nchr) ## print(nstart) ## print(nend) return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1 def PreviousProbeOf(element): #element is a list with chromosome and probe position, e.g.[2,242953866] n=LocateProbe(element)-1 if n<0: return 0 else: return n def NextProbeOf(element): n= LocateProbe(element)+1 if n>len(probes)-1: return len(probes)-1 else: return n def RelationshipOf(seg1,seg2): ## function RelationshipOf(seg1 as list, seg2 as list) ## compare two segments seg1 and seg2, ## seg1 and seg2 are two lists, each with the Min and Max values of the segment, ## Min should be the first value in the list and Max the second. ## return a string indicating the relationship between the two segments: ## "equal" : identical ## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap ## "in": seg1 is entirely within seg2 ## "cover": seg1 is bigger than seg2 and completely contains seg2 ## "overlap": seg1 partially overlaps with seg2 if seg1[0]==seg2[0] and seg1[1]==seg2[1]: return "equal" elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]: return "in" elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]: return "cover" if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) : return "overlap" else : return "no relationship" def ZeroFillSpaces(StartProbeIndex,EndProbeIndex): #return a list of lists each is a properly formated segment with value of 0.0 #StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted lists_of_zerofills=list() #print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex)) if StartProbeIndex< 0 or StartProbeIndex>= len(probes): print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex)) quit(767) elif EndProbeIndex <0 or EndProbeIndex>=len(probes): print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex)) quit(768) elif StartProbeIndex>=EndProbeIndex: return lists_of_zerofills else: pass stChr=probes[StartProbeIndex][0] stBase=probes[StartProbeIndex][1] endChr=probes[EndProbeIndex][0] endBase=probes[EndProbeIndex][1] while stChr<=endChr: if stChr==endChr: if stBase<endBase: n=0 lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0]) else: break else: endMark=LastMarkersonChromosomes[stChr] lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0]) #move on to next chromosome stChr+=1 if stChr>24: break else: stBase=FirstMarkersonChromosomes[stChr] return lists_of_zerofills def getCytobands(chromosome,startn,endn): #note: chromosome is 1-22 and X or Y try: ## cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu', ## database='hg19') cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306', database='ucsc_hg18') except mysql.connector.Error as err: print("Error occured during obtaining cytoband information from local database.") if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: print("Something is wrong with your user name or password") elif err.errno == errorcode.ER_BAD_DB_ERROR: print("Database does not exists") else: print(err) cursor = cnx.cursor() query = ("SELECT name" " FROM cytoBand " " WHERE chrom=\"chr"+str(chromosome)+"\" " " and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")" " ORDER BY name") #print(query) cursor.execute(query) cytobands='' for bandname in cursor: cytobands=cytobands+''+bandname cursor.close() cnx.close() return cytobands def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap): #region is a peak, as a list, in order: chromosome, start, end peak=region #first initiate codemap to 0 for x in codemap.keys(): codemap[x]=[0,0] for Array in SampleSegments: cumulativelength=0 for row in Array[1]: if row[1]==peak[0]:#same chromosome segA=[peak[1],peak[2]] segB=[row[2],row[3]] re=RelationshipOf(segA,segB) if re=="no relationship":#do nothing "no relationship" #print("goes here") pass elif re=="in" or re=="equal": codemap[arraymap[Array[0]]][0]=1 break #this sample has been counted, so no need to check other segments in this sample elif re=="cover": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover" codemap[arraymap[Array[0]]][0]=1 break #this sample has been counted, so no need to check other segments in this sample else: cumulativelength+=sb #print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)) elif re=="overlap": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if segA[0]<segB[0]: #left overlap sb_overlap=segA[1]-segB[0] sb_outside=segB[1]-segA[1] else: #right overlap sb_overlap=segB[1]-segA[0] sb_outside=segA[0]-segA[0] #if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap if cumulativelength+10*sb_overlap>sa: #test of substantial overlap codemap[arraymap[Array[0]]][0]=1 break #this sample has been counted, so no need to check other segments in this sample else: cumulativelength+=sb_overlap #print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.)) else: print("Strange relationship: "+re) exit(436) for Array in ControlSegments: cumulativelength=0 for row in Array[1]: if row[1]==peak[0]:#same chromosome segA=[peak[1],peak[2]] segB=[row[2],row[3]] re=RelationshipOf(segA,segB) if re=="no relationship":#do nothing pass elif re=="in" or re=="equal": codemap[arraymap[Array[0]]][1]=1 break #this Control has been counted, so no need to check other segments in this Control elif re=="cover": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover" codemap[arraymap[Array[0]]][1]=1 break #this Control has been counted, so no need to check other segments in this Control else: cumulativelength+=sb #print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)) elif re=="overlap": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if segA[0]<segB[0]: #left overlap sb_overlap=segA[1]-segB[0] sb_outside=segB[1]-segA[1] else: #right overlap sb_overlap=segB[1]-segA[0] sb_outside=segA[0]-segA[0] #if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap if cumulativelength+10*sb_overlap>sa: #test of substantial overlap codemap[arraymap[Array[0]]][1]=1 break #this Control has been counted, so no need to check other segments in this Control else: cumulativelength+=sb_overlap #print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.)) else: print("Strange relationship: "+re) exit(436) #calculate a,b,c,d a=b=c=d=0 for x in codemap.keys(): if codemap[x][0]==1 and codemap[x][1]==1:# + + a+=1 elif codemap[x][0]==1 and codemap[x][1]==0:# + - b+=1 elif codemap[x][0]==0 and codemap[x][1]==1:# + - c+=1 elif codemap[x][0]==0 and codemap[x][1]==0:# + - d+=1 return [a,b,c,d] # main execution starts here import os import os.path import csv import shutil import time import operator import copy import scipy import math from scipy import stats import mysql.connector from mysql.connector import errorcode startclock=time.clock() print ("\n"*5) print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) # read probes print("Now read probes.") ifile = open(probefile, "r") reader = csv.reader(ifile,delimiter='\t') for row in reader: try: n=0 if row[1]=='X': #print("probes on X found") n=23 elif row[1]=='Y': n=24 elif row[1]=='MT':#ignore MT continue else: n=int(row[1]) #note: start size position has been added by 1 as below!! probes.append([n,int(row[2])]) except: print(row) quit(963) ifile.close() probes=sorted(probes,key=operator.itemgetter(0,1)) #initiate FirstMarkers and LastMarkers on chromosomes ##FirstMarkersonChromosomes=dict()#First markers of chromosomes ##LastMarkersonChromosomes=dict() #Last markers of chromosomes n=0 for index,p in enumerate(probes): if p[0]!=n: #first occation of a chromosome FirstMarkersonChromosomes[p[0]]=(p[1]) if n!=0: #not just start LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1] n=p[0] LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list SegmentsDict=dict() ScoresDict=dict() sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt' for loopcontrol in range(1,4): if loopcontrol ==1: #FamilialTumor-FamilialGermline SampleSize99=ControlSize99=42 Samples = r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor.Loss.centroless.seg' Controls= r'C:\Users\ly091\Documents\result8\Loss\FamilialGermline.Loss.centroless.seg' outputfile=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.gistic' StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.statistics.txt' patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt' elif loopcontrol ==2: #SporadicTumor - SporadicGermline SampleSize99=ControlSize99=91 Samples = r'C:\Users\ly091\Documents\result8\Loss\sporadicTumor.Loss.centroless.seg' Controls= r'C:\Users\ly091\Documents\result8\Loss\sporadicGermline.Loss.centroless.seg' outputfile=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.gistic' StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.statistics.txt' patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt' elif loopcontrol ==3: #Tumor-Germline SampleSize99=ControlSize99=133 Samples = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.centroless.seg' Controls= r'C:\Users\ly091\Documents\result8\Loss\germline.Loss.centroless.seg' outputfile=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.gistic' StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.statistics.txt' patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt' else: print("Error in control.") exit(444) print("Now build the code map and the array map...") #build the code map codemap=dict()#to store the sample information file ifile = open(patientcodes, "r") reader = csv.reader(ifile,delimiter='\t') for row in reader: if len(row)==0: pass else: codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region #0 is no, 1 is yes ifile.close() #build the array map arraymap=dict() ifile = open(sampleinformationfile, "r") reader = csv.reader(ifile,delimiter='\t') rownum = 0 for row in reader: if rownum == 0: rownum+=1 if row[17]!='Sample2': print("File error!") quit(883) else: sample2=row[17] if sample2 in codemap: arraymap[row[0].split('.')[0]]=sample2 else: arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used. ifile.close() if Samples in SegmentsDict: print("\n*** Use saved results for "+Samples +" ***\n") SampleSegments=copy.deepcopy(SegmentsDict[Samples]) SampleScores=copy.deepcopy(ScoresDict[Samples]) else: #read Samples: print("Now read Samples.") SampleSegments=list() allsegments_unsorted=list() ifile = open(Samples, "r") reader = csv.reader(ifile,delimiter='\t') rownum = 0 ndebug=0 for row in reader: if rownum == 0: rownum+=1 else: chromosome=0 if row[1]=='X': chromosome=23 elif row[1]=='Y': chromosome=24 else: chromosome=int(row[1]) # if chromosome==4: if True: if ndebug>Recordstoread: print("Warning: Sample file is not read to the end.") break if arraymap[row[0]]!='unused': ndebug+=1 allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])]) ifile.close() allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3)) array='' for row in allsegments: if row[0]!=array: array=row[0] newarray=[array,[row]] SampleSegments.append(newarray)#order: array, [array, chromosome, start, end] else: SampleSegments[-1][1].append(row) SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted cp=0 SliceA=[] SliceB=[] pt=list() #list of pointers to the currently working segment in SampleSegments, #Calculate G-scores for Samples SampleScores=list() #chromosome, start, end, p-value, G-score #remember this is for LOH print("Now Calculate G-Scores for Samples.") #print(len(SampleSegments)) for row in SampleSegments: pt.append(0)#initiate the pointers to 0 #Find the slice while True: #pre-set SliceA #print("main while") empty=True for index, row in enumerate(SampleSegments): if pt[index]<len(row[1]): ## print(row[1][pt[index]]) lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] if lengthofsegment<2000000:#ignore very long segments: SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array empty=False break #print(empty) if empty: break ## print("\nBefore refinement,SliceA"+str(SliceA)) #refine SliceA for index,row in enumerate(SampleSegments): if pt[index]<len(row[1]): lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] ## print(lengthofsegment) if lengthofsegment>=2000000:#ignore very long segments # pt[index]+=1 continue SliceB=row[1][pt[index]][:] ## print("\nIn refinement,SliceA:"+str(SliceA)) ## print("In refinement, SliceB:"+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes pass elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes SliceA=SliceB[:] pass else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) if re=='equal': pass elif re=='no relationship': if segA[0]<segB[0]: #A is before B pass else: #A is after B SliceA=SliceB[:] elif re=='cover': if SliceA[2]<SliceB[2]: SliceA[3]=SliceB[2] elif SliceA[2]==SliceB[2]: SliceA=SliceB[:] else: print("This is impossible!") exit(388) elif re=='in': if SliceA[2]>SliceB[2]: SliceA[2]=segB[0] SliceA[3]=segA[0] elif SliceA[2]==SliceB[2]: pass else: print("This is impossible!") exit(554) elif re=='overlap': if SliceA[2]<SliceB[2]: #A before B SliceA[3]=SliceB[2] else: #A after B SliceA[3]=SliceA[2] SliceA[2]=SliceB[2] else: print(re) print("This is impossible!") exit(323) #use SliceA to go through all segments if cp%EchoPerLines==0: ## if True: print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA)) cp+=1 SumSample=0 for index, row in enumerate(SampleSegments): if pt[index]<len(row[1]): SliceB=row[1][pt[index]]#segment to be tested if SliceB[2]==SliceB[3]:#remove 0 length segments from the que. pt[index]+=1 continue ## print("\nSliceA: "+str(SliceA)) ## print("SliceB: "+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes ## print("A is before B in different chromosomes.") continue elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes pt[index]+=1 continue else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) ## print(re) if re=='equal': SumSample=SumSample+1 #1.0 is for LOH only because no amplitude information is used for LOH pt[index]+=1 #move the pointer forward by 1 elif re=='no relationship': if segA[0]<segB[0]: #A is before B continue else: #A is after B pt[index]+=1 continue elif re=='cover': print("This is impossible!") print("\nSliceA: "+str(SliceA)) print("SliceB: "+str(SliceB)) exit(991) elif re=='in': SumSample=SumSample+1 #1.0 is for LOH only because no amplitude information is used for LOH if SliceB[3]>SliceA[3]: SliceB[2]=SliceA[3] else: pt[index]+=1 elif re=='overlap': SumSample+=1 if SliceA[2]<SliceB[2]: SliceB[2]=SliceA[3] else: pt[index]+=1 else: print("This is impossible!") exit(324) SliceA.append(0.0) SliceA.append(SumSample)#append p value and frequency of LOH ## print("After summation,SliceA"+str(SliceA)) SampleScores.append(SliceA) SampleSegments=SampleSegmentsBackup # SampleSegments restored, segments are sorted if Samples not in SegmentsDict: SegmentsDict[Samples]=copy.deepcopy(SampleSegments) ScoresDict[Samples]=copy.deepcopy(SampleScores) if Controls in SegmentsDict: print("\n*** Use saved results for "+Controls +" ***\n") ControlSegments=copy.deepcopy(SegmentsDict[Controls]) ControlScores=copy.deepcopy(ScoresDict[Controls]) else: #read Controls: print("Now read Controls.") ControlSegments=list() allsegments_unsorted=list() ifile = open(Controls, "r") reader = csv.reader(ifile,delimiter='\t') rownum = 0 ndebug=0 for row in reader: if rownum == 0: rownum+=1 else: chromosome=0 if row[1]=='X': chromosome=23 elif row[1]=='Y': chromosome=24 else: chromosome=int(row[1]) # if chromosome==4: if True: if ndebug>Recordstoread: print("Warning: Sample file is not read to the end.") break if arraymap[row[0]]!='unused': ndebug+=1 allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])]) ifile.close() allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3)) array='' for row in allsegments: if row[0]!=array: array=row[0] newarray=[array,[row]] ControlSegments.append(newarray) else: ControlSegments[-1][1].append(row) ControlSegmentsBackup=copy.deepcopy(ControlSegments)#order: array, [array, chromosome, start, end], it is sorted cp=0 SliceA=[] SliceB=[] pt=list() #list of pointers to the currently working segment in ControlSegments, #Calculate G-scores for Controls ControlScores=list() #chromosome, start, end, p-value, G-score #remember this is for LOH print("Now Calculate G-Scores for Controls.") #print(len(ControlSegments)) for row in ControlSegments: pt.append(0)#initiate the pointers to 0 #Find the slice while True: #pre-set SliceA #print("main while") empty=True for index, row in enumerate(ControlSegments): if pt[index]<len(row[1]): ## print(row[1][pt[index]]) lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] if lengthofsegment<2000000:#ignore very long segments: SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array empty=False break #print(empty) if empty: break ## print("\nBefore refinement,SliceA"+str(SliceA)) #refine SliceA for index,row in enumerate(ControlSegments): if pt[index]<len(row[1]): lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] ## print(lengthofsegment) if lengthofsegment>=2000000:#ignore very long segments # pt[index]+=1 continue SliceB=row[1][pt[index]][:] ## print("\nIn refinement,SliceA:"+str(SliceA)) ## print("In refinement, SliceB:"+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes pass elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes SliceA=SliceB[:] pass else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) if re=='equal': pass elif re=='no relationship': if segA[0]<segB[0]: #A is before B pass else: #A is after B SliceA=SliceB[:] elif re=='cover': if SliceA[2]<SliceB[2]: SliceA[3]=SliceB[2] elif SliceA[2]==SliceB[2]: SliceA=SliceB[:] else: print("This is impossible!") exit(388) elif re=='in': if SliceA[2]>SliceB[2]: SliceA[2]=segB[0] SliceA[3]=segA[0] elif SliceA[2]==SliceB[2]: pass else: print("This is impossible!") exit(554) elif re=='overlap': if SliceA[2]<SliceB[2]: #A before B SliceA[3]=SliceB[2] else: #A after B SliceA[3]=SliceA[2] SliceA[2]=SliceB[2] else: print(re) print("This is impossible!") exit(323) #use SliceA to go through all segments if cp%EchoPerLines==0: ## if True: print(str(cp)+": Now working on Controls. SliceA after refinery: "+str(SliceA)) cp+=1 SumControl=0 for index, row in enumerate(ControlSegments): if pt[index]<len(row[1]): SliceB=row[1][pt[index]]#segment to be tested if SliceB[2]==SliceB[3]:#remove 0 length segments from the que. pt[index]+=1 continue ## print("\nSliceA: "+str(SliceA)) ## print("SliceB: "+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes ## print("A is before B in different chromosomes.") continue elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes pt[index]+=1 continue else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) ## print(re) if re=='equal': SumControl=SumControl+1 #1.0 is for LOH only because no amplitude information is used for LOH pt[index]+=1 #move the pointer forward by 1 elif re=='no relationship': if segA[0]<segB[0]: #A is before B continue else: #A is after B pt[index]+=1 continue elif re=='cover': print("This is impossible!") print("\nSliceA: "+str(SliceA)) print("SliceB: "+str(SliceB)) exit(991) elif re=='in': SumControl=SumControl+1 #1.0 is for LOH only because no amplitude information is used for LOH if SliceB[3]>SliceA[3]: SliceB[2]=SliceA[3] else: pt[index]+=1 elif re=='overlap': SumControl+=1 if SliceA[2]<SliceB[2]: SliceB[2]=SliceA[3] else: pt[index]+=1 else: print("This is impossible!") exit(324) SliceA.append(0.0) SliceA.append(SumControl)#append p value and frequency of LOH ## print("After summation,SliceA"+str(SliceA)) ControlScores.append(SliceA) ControlSegments=ControlSegmentsBackup # ControlSegments restored, segments are sorted if Controls not in SegmentsDict: SegmentsDict[Controls]=copy.deepcopy(ControlSegments) ScoresDict[Controls]=copy.deepcopy(ControlScores) #Merge SampleScores and ControlScores, calculate subtracted G-score, calculate p-value using Mcnemar's test nSamples=len(SampleSegments) nControls=len(ControlSegments) FinalScores=list() ## The 2x2 contigency table for Fisher's Exact test ## ## with CNA without CNA ## Sample a b ## Control c d print("\nNow merge Samples and Controls.") while len(SampleScores)>0 and len(ControlScores)>0: SliceA=SampleScores.pop(0) #order: array, chromosome, start, end, p-value, sum, just ignore array SliceB=ControlScores.pop(0) #The naming of SliceA and SliceB is for convenience if SliceA[1]<SliceB[1]: #A is before B in different chromosomes #def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap): region=SliceA[1:4] ## print(region) M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) ## print(M) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) logp=-math.log10(pvalue) ## if logp<0.000001: ## print("a="+str(a)+", b="+str(b)+", c="+str(c)+", d="+str(d)) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) ControlScores.insert(0,SliceB) #push back B elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes #This is not interesting, ignored SampleScores.insert(0,SliceA) #push back A else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) if re=='equal' or re=='cover' or re=='in' or re=='overlap' : region=SliceA[1:4] ## print(region) M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) if pvalue<1e-10: pvalue=1e-10 logp=-math.log10(pvalue) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples-SliceB[5]*1.0/nControls newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) elif re=='no relationship': if segA[0]<segB[0]: #A is before B region=SliceA[1:4] ## print(region) M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) logp=-math.log10(pvalue) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples # print("a="+str(a)+", nSamples="+str(nSamples)+", x="+str(x)) newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) ControlScores.insert(0,SliceB) #push back B else: #A is after B #This is not interesting, ignored SampleScores.insert(0,SliceA) #push back A else: print("This is impossible!") exit(325) if len(SampleScores)>0: while len(SampleScores)>0 : SliceA=SampleScores.pop(0) #order: array, chromosome, start, end, p-value, sum, just ignore array region=SliceA[1:4] M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) logp=-math.log10(pvalue) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples # print("a="+str(a)+", nSamples="+str(nSamples)+", x="+str(x)) newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) ##print("\n\n\n") ##for debugrowtoprint in FinalScores: ## print(debugrowtoprint) #print(FinalScores) #Merge nearby peaks, critical parameter: GaptoMerge, MinimumPeakWidth print("Now merge nearby peaks.") MergedFinalScores=list() CurrentPeak=list() for row in FinalScores: if len(CurrentPeak)==0:#first peak CurrentPeak=row[:] else: if CurrentPeak[0]!=row[0]:#different chromosome MergedFinalScores.append(CurrentPeak)#save CurrentPeak CurrentPeak=row[:] #move on else: n=row[1]-CurrentPeak[2] if n<=GaptoMerge:#merge peaks CurrentPeak[2]=row[2] else:# do not merge MergedFinalScores.append(CurrentPeak)#save CurrentPeak CurrentPeak=row[:] if len(CurrentPeak)>0: MergedFinalScores.append(CurrentPeak)#save the last CurrentPeak print("Now remove peaks less then "+str(MinimumPeakWidth) +" bp.") FinalScores=list() #FinalScores will be restored here #print(len(MergedFinalScores)) for row in MergedFinalScores: print(row) if row[2]-row[1]>=MinimumPeakWidth: FinalScores.append(row) #recalculate GScore and P value #SampleSegments and ControlSegments, the order: array, [array, chromosome, start, end], segments are sorted print("Now recalculate the GScore and P value.") NewFinalScores=list() ofile=open(StatisticsOutput,'w',newline='') stat_writer=csv.writer(ofile, delimiter='\t') header=['chromosome','Start','End','kbp','a','b','c','d','pvalue'] stat_writer.writerow(header) for peak in FinalScores: #print(peak) region=peak[0:3] M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) print(M) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) if pvalue<1e-10: print('Warnin: extremely small pvalue found',pvalue,'assigned 1e-10') pvalue=1e-10 logp=-math.log10(pvalue) if pvalue<pvaluelimit : #1.3 is p=0.05, 2 is for p=0.01, 3 for p=0.001 #c/d is the odds ratio of control peak[3]=logp peak[4]=x NewFinalScores.append(peak) #header=['chromosome','Start','End','kbp','a','b','c','d'] stat_writer.writerow([peak[0],peak[1],peak[2],(peak[2]-peak[1])/1000.0,a,b,c,d,float("%.2e"%pvalue)]) FinalScores=NewFinalScores#restore FinalScores ofile.close() #zero-fill the spaces print("Now zero fill.") ptrProbe=0 #initialize the pointer FinalScoresZeros=list() for row in FinalScores: # print("row in FinalScores : "+str(row)) zl=ZeroFillSpaces(ptrProbe,PreviousProbeOf([row[0],row[1]])) for z in zl: FinalScoresZeros.append(z) FinalScoresZeros.append(row) ptrProbe=NextProbeOf([row[0],row[2]]) #after looping the last loh, seal the "zero fill" #print("Now seal the zero fill...") zl=ZeroFillSpaces(ptrProbe,len(probes)-1) for z in zl: FinalScoresZeros.append(z) #log print("logging...") ofile=open(outputfile,'w',newline='') writer=csv.writer(ofile, delimiter='\t') header=['Type','Chromosome','Start','End','-log10(q-value)','G-score','average','amplitute','frequency'] writer.writerow(header) for row in FinalScoresZeros: #order: chromosome, start, end, p-value, sum if row[0]==23: row[0]='X' elif row[0]==24: row[0]='Y' row.insert(0,"Del") row.append(0.0) row.append(0.0) row.append(0.0) writer.writerow(row) ofile.close() print ("Analysis is finished successfully! ") endclock=time.clock() print("The program ends at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) print("%.1f"%(endclock-startclock)+" seconds used.")
Texto modificado
Abrir archivo
#LOH probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt' ##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.Score.txt' ##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Gain\Germline.Gain.Score.txt' GScoreThreshold=0.01#minimum Gscore to be recorded Recordstoread=10000000#for debug EchoPerLines=1000#verbose GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair MinimumPeakWidth=2000#minimum Peak Width, unit base pair pvaluelimit=0.01 #Global variables probes=list()#from the probe file, will be filled and sorted FirstMarkersonChromosomes=dict()#First markers of chromosomes LastMarkersonChromosomes=dict() #Last markers of chromosomes #Functions def mcnemar(a,b,c,d): """ Input args: a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable Output: pvalue of test. """ if b+c>0: chi2testval = (abs(b-c) - 1)**2/ (b + c) df = 1 pvalue = 1 - stats.chi2.cdf(chi2testval, df) else: pvalue=1.0 return pvalue def AbeforeB(a,b): #compare postions of two probes, a and b contain: [chromosome, base pair position] if a[0]<b[0]: return True elif a[0]>b[0]: return False else: #case: in the same chromosome if a[1]<b[1]: return True elif a[1]>b[1]: return False else: return True #same position, return True here def LocateProbe(probe):#probe has: chromosome number, start size p1=0 p2=len(probes)-1 while True: pcenter=int((p1+p2)/2) #print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter)) if probes[pcenter]==probe: return pcenter elif probes[p1]==probe: return p1 elif probes[p2]==probe: return p2 elif AbeforeB(probes[pcenter],probe):#take the right half if p1!=pcenter: p1=pcenter else: print("cannot find probe") print(str(probe)) exit(577) else: #take the left half if p2!=pcenter: p2=pcenter else: print("cannot find probe") print(str(probe)) exit(577) def LeftTarget(probe):#probe has: chromosome number, start size ## print(probe) p1=0 ## print(len(gtarget_numbers)) while p1<len(gtarget_numbers): if gtarget_numbers[p1][1]==probe[0]:#same chromosome #print("goes here") if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered if p1==0: return 0 else: return p1-1 p1+=1 if p1==len(gtarget_numbers): return -111111111111 def RightTarget(probe):#probe has: chromosome number, start size # print(probe) p1=1 while p1<len(gtarget_numbers): ## print(gtarget_numbers[-p1]) ## print(p1) ## print(probe) if gtarget_numbers[-p1][1]==probe[0]:#same chromosome if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered if p1==1: return len(gtarget_numbers)-1 else: return len(gtarget_numbers)-p1+1 p1+=1 if p1==len(gtarget_numbers): return p1 def CountMarkers(nchr,nstart,nend): ## print("in countmarkers") ## print(nchr) ## print(nstart) ## print(nend) return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1 def PreviousProbeOf(element): #element is a list with chromosome and probe position, e.g.[2,242953866] n=LocateProbe(element)-1 if n<0: return 0 else: return n def NextProbeOf(element): n= LocateProbe(element)+1 if n>len(probes)-1: return len(probes)-1 else: return n def RelationshipOf(seg1,seg2): ## function RelationshipOf(seg1 as list, seg2 as list) ## compare two segments seg1 and seg2, ## seg1 and seg2 are two lists, each with the Min and Max values of the segment, ## Min should be the first value in the list and Max the second. ## return a string indicating the relationship between the two segments: ## "equal" : identical ## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap ## "in": seg1 is entirely within seg2 ## "cover": seg1 is bigger than seg2 and completely contains seg2 ## "overlap": seg1 partially overlaps with seg2 if seg1[0]==seg2[0] and seg1[1]==seg2[1]: return "equal" elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]: return "in" elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]: return "cover" if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) : return "overlap" else : return "no relationship" def ZeroFillSpaces(StartProbeIndex,EndProbeIndex): #return a list of lists each is a properly formated segment with value of 0.0 #StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted lists_of_zerofills=list() #print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex)) if StartProbeIndex< 0 or StartProbeIndex>= len(probes): print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex)) quit(767) elif EndProbeIndex <0 or EndProbeIndex>=len(probes): print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex)) quit(768) elif StartProbeIndex>=EndProbeIndex: return lists_of_zerofills else: pass stChr=probes[StartProbeIndex][0] stBase=probes[StartProbeIndex][1] endChr=probes[EndProbeIndex][0] endBase=probes[EndProbeIndex][1] while stChr<=endChr: if stChr==endChr: if stBase<endBase: n=0 lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0]) else: break else: endMark=LastMarkersonChromosomes[stChr] lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0]) #move on to next chromosome stChr+=1 if stChr>24: break else: stBase=FirstMarkersonChromosomes[stChr] return lists_of_zerofills def getCytobands(chromosome,startn,endn): #note: chromosome is 1-22 and X or Y try: ## cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu', ## database='hg19') cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306', database='ucsc_hg18') except mysql.connector.Error as err: print("Error occured during obtaining cytoband information from local database.") if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: print("Something is wrong with your user name or password") elif err.errno == errorcode.ER_BAD_DB_ERROR: print("Database does not exists") else: print(err) cursor = cnx.cursor() query = ("SELECT name" " FROM cytoBand " " WHERE chrom=\"chr"+str(chromosome)+"\" " " and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")" " ORDER BY name") #print(query) cursor.execute(query) cytobands='' for bandname in cursor: cytobands=cytobands+''+bandname cursor.close() cnx.close() return cytobands def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap): #region is a peak, as a list, in order: chromosome, start, end peak=region #first initiate codemap to 0 for x in codemap.keys(): codemap[x]=[0,0] for Array in SampleSegments: cumulativelength=0 for row in Array[1]: if row[1]==peak[0]:#same chromosome segA=[peak[1],peak[2]] segB=[row[2],row[3]] re=RelationshipOf(segA,segB) if re=="no relationship":#do nothing "no relationship" #print("goes here") pass elif re=="in" or re=="equal": codemap[arraymap[Array[0]]][0]=1 break #this sample has been counted, so no need to check other segments in this sample elif re=="cover": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover" codemap[arraymap[Array[0]]][0]=1 break #this sample has been counted, so no need to check other segments in this sample else: cumulativelength+=sb #print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)) elif re=="overlap": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if segA[0]<segB[0]: #left overlap sb_overlap=segA[1]-segB[0] sb_outside=segB[1]-segA[1] else: #right overlap sb_overlap=segB[1]-segA[0] sb_outside=segA[0]-segA[0] #if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap if cumulativelength+10*sb_overlap>sa: #test of substantial overlap codemap[arraymap[Array[0]]][0]=1 break #this sample has been counted, so no need to check other segments in this sample else: cumulativelength+=sb_overlap #print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.)) else: print("Strange relationship: "+re) exit(436) for Array in ControlSegments: cumulativelength=0 for row in Array[1]: if row[1]==peak[0]:#same chromosome segA=[peak[1],peak[2]] segB=[row[2],row[3]] re=RelationshipOf(segA,segB) if re=="no relationship":#do nothing pass elif re=="in" or re=="equal": codemap[arraymap[Array[0]]][1]=1 break #this Control has been counted, so no need to check other segments in this Control elif re=="cover": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover" codemap[arraymap[Array[0]]][1]=1 break #this Control has been counted, so no need to check other segments in this Control else: cumulativelength+=sb #print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)) elif re=="overlap": sa=segA[1]-segA[0] sb=segB[1]-segB[0] if segA[0]<segB[0]: #left overlap sb_overlap=segA[1]-segB[0] sb_outside=segB[1]-segA[1] else: #right overlap sb_overlap=segB[1]-segA[0] sb_outside=segA[0]-segA[0] #if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap if cumulativelength+10*sb_overlap>sa: #test of substantial overlap codemap[arraymap[Array[0]]][1]=1 break #this Control has been counted, so no need to check other segments in this Control else: cumulativelength+=sb_overlap #print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.)) else: print("Strange relationship: "+re) exit(436) #calculate a,b,c,d a=b=c=d=0 for x in codemap.keys(): if codemap[x][0]==1 and codemap[x][1]==1:# + + a+=1 elif codemap[x][0]==1 and codemap[x][1]==0:# + - b+=1 elif codemap[x][0]==0 and codemap[x][1]==1:# + - c+=1 elif codemap[x][0]==0 and codemap[x][1]==0:# + - d+=1 return [a,b,c,d] # main execution starts here import os import os.path import csv import shutil import time import operator import copy import scipy import math from scipy import stats import mysql.connector from mysql.connector import errorcode startclock=time.clock() print ("\n"*5) print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) # read probes print("Now read probes.") ifile = open(probefile, "r") reader = csv.reader(ifile,delimiter='\t') for row in reader: try: n=0 if row[1]=='X': #print("probes on X found") n=23 elif row[1]=='Y': n=24 elif row[1]=='MT':#ignore MT continue else: n=int(row[1]) #note: start size position has been added by 1 as below!! probes.append([n,int(row[2])]) except: print(row) quit(963) ifile.close() probes=sorted(probes,key=operator.itemgetter(0,1)) #initiate FirstMarkers and LastMarkers on chromosomes ##FirstMarkersonChromosomes=dict()#First markers of chromosomes ##LastMarkersonChromosomes=dict() #Last markers of chromosomes n=0 for index,p in enumerate(probes): if p[0]!=n: #first occation of a chromosome FirstMarkersonChromosomes[p[0]]=(p[1]) if n!=0: #not just start LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1] n=p[0] LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list SegmentsDict=dict() ScoresDict=dict() sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt' for loopcontrol in range(1,4): if loopcontrol ==1: #FamilialTumor-FamilialGermline SampleSize99=ControlSize99=42 Samples = r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor.Gain.centroless.seg' Controls= r'C:\Users\ly091\Documents\result8\Gain\FamilialGermline.Gain.centroless.seg' outputfile=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.gistic' StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.statistics.txt' patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt' elif loopcontrol ==2: #SporadicTumor - SporadicGermline SampleSize99=ControlSize99=91 Samples = r'C:\Users\ly091\Documents\result8\Gain\sporadicTumor.Gain.centroless.seg' Controls= r'C:\Users\ly091\Documents\result8\Gain\sporadicGermline.Gain.centroless.seg' outputfile=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.gistic' StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.statistics.txt' patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt' elif loopcontrol ==3: #Tumor-Germline SampleSize99=ControlSize99=133 Samples = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.centroless.seg' Controls= r'C:\Users\ly091\Documents\result8\Gain\germline.Gain.centroless.seg' outputfile=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.gistic' StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.statistics.txt' patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt' else: print("Error in control.") exit(444) print("Now build the code map and the array map...") #build the code map codemap=dict()#to store the sample information file ifile = open(patientcodes, "r") reader = csv.reader(ifile,delimiter='\t') for row in reader: if len(row)==0: pass else: codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region #0 is no, 1 is yes ifile.close() #build the array map arraymap=dict() ifile = open(sampleinformationfile, "r") reader = csv.reader(ifile,delimiter='\t') rownum = 0 for row in reader: if rownum == 0: rownum+=1 if row[17]!='Sample2': print("File error!") quit(883) else: sample2=row[17] if sample2 in codemap: arraymap[row[0].split('.')[0]]=sample2 else: arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used. ifile.close() if Samples in SegmentsDict: print("\n*** Use saved results for "+Samples +" ***\n") SampleSegments=copy.deepcopy(SegmentsDict[Samples]) SampleScores=copy.deepcopy(ScoresDict[Samples]) else: #read Samples: print("Now read Samples.") SampleSegments=list() allsegments_unsorted=list() ifile = open(Samples, "r") reader = csv.reader(ifile,delimiter='\t') rownum = 0 ndebug=0 for row in reader: if rownum == 0: rownum+=1 else: chromosome=0 if row[1]=='X': chromosome=23 elif row[1]=='Y': chromosome=24 else: chromosome=int(row[1]) # if chromosome==4: if True: if ndebug>Recordstoread: print("Warning: Sample file is not read to the end.") break if arraymap[row[0]]!='unused': ndebug+=1 allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])]) ifile.close() allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3)) array='' for row in allsegments: if row[0]!=array: array=row[0] newarray=[array,[row]] SampleSegments.append(newarray)#order: array, [array, chromosome, start, end] else: SampleSegments[-1][1].append(row) SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted cp=0 SliceA=[] SliceB=[] pt=list() #list of pointers to the currently working segment in SampleSegments, #Calculate G-scores for Samples SampleScores=list() #chromosome, start, end, p-value, G-score #remember this is for LOH print("Now Calculate G-Scores for Samples.") #print(len(SampleSegments)) for row in SampleSegments: pt.append(0)#initiate the pointers to 0 #Find the slice while True: #pre-set SliceA #print("main while") empty=True for index, row in enumerate(SampleSegments): if pt[index]<len(row[1]): ## print(row[1][pt[index]]) lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] if lengthofsegment<2000000:#ignore very long segments: SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array empty=False break #print(empty) if empty: break ## print("\nBefore refinement,SliceA"+str(SliceA)) #refine SliceA for index,row in enumerate(SampleSegments): if pt[index]<len(row[1]): lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] ## print(lengthofsegment) if lengthofsegment>=2000000:#ignore very long segments # pt[index]+=1 continue SliceB=row[1][pt[index]][:] ## print("\nIn refinement,SliceA:"+str(SliceA)) ## print("In refinement, SliceB:"+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes pass elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes SliceA=SliceB[:] pass else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) if re=='equal': pass elif re=='no relationship': if segA[0]<segB[0]: #A is before B pass else: #A is after B SliceA=SliceB[:] elif re=='cover': if SliceA[2]<SliceB[2]: SliceA[3]=SliceB[2] elif SliceA[2]==SliceB[2]: SliceA=SliceB[:] else: print("This is impossible!") exit(388) elif re=='in': if SliceA[2]>SliceB[2]: SliceA[2]=segB[0] SliceA[3]=segA[0] elif SliceA[2]==SliceB[2]: pass else: print("This is impossible!") exit(554) elif re=='overlap': if SliceA[2]<SliceB[2]: #A before B SliceA[3]=SliceB[2] else: #A after B SliceA[3]=SliceA[2] SliceA[2]=SliceB[2] else: print(re) print("This is impossible!") exit(323) #use SliceA to go through all segments if cp%EchoPerLines==0: ## if True: print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA)) cp+=1 SumSample=0 for index, row in enumerate(SampleSegments): if pt[index]<len(row[1]): SliceB=row[1][pt[index]]#segment to be tested if SliceB[2]==SliceB[3]:#remove 0 length segments from the que. pt[index]+=1 continue ## print("\nSliceA: "+str(SliceA)) ## print("SliceB: "+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes ## print("A is before B in different chromosomes.") continue elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes pt[index]+=1 continue else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) ## print(re) if re=='equal': SumSample=SumSample+1 #1.0 is for LOH only because no amplitude information is used for LOH pt[index]+=1 #move the pointer forward by 1 elif re=='no relationship': if segA[0]<segB[0]: #A is before B continue else: #A is after B pt[index]+=1 continue elif re=='cover': print("This is impossible!") print("\nSliceA: "+str(SliceA)) print("SliceB: "+str(SliceB)) exit(991) elif re=='in': SumSample=SumSample+1 #1.0 is for LOH only because no amplitude information is used for LOH if SliceB[3]>SliceA[3]: SliceB[2]=SliceA[3] else: pt[index]+=1 elif re=='overlap': SumSample+=1 if SliceA[2]<SliceB[2]: SliceB[2]=SliceA[3] else: pt[index]+=1 else: print("This is impossible!") exit(324) SliceA.append(0.0) SliceA.append(SumSample)#append p value and frequency of LOH ## print("After summation,SliceA"+str(SliceA)) SampleScores.append(SliceA) SampleSegments=SampleSegmentsBackup # SampleSegments restored, segments are sorted if Samples not in SegmentsDict: SegmentsDict[Samples]=copy.deepcopy(SampleSegments) ScoresDict[Samples]=copy.deepcopy(SampleScores) if Controls in SegmentsDict: print("\n*** Use saved results for "+Controls +" ***\n") ControlSegments=copy.deepcopy(SegmentsDict[Controls]) ControlScores=copy.deepcopy(ScoresDict[Controls]) else: #read Controls: print("Now read Controls.") ControlSegments=list() allsegments_unsorted=list() ifile = open(Controls, "r") reader = csv.reader(ifile,delimiter='\t') rownum = 0 ndebug=0 for row in reader: if rownum == 0: rownum+=1 else: chromosome=0 if row[1]=='X': chromosome=23 elif row[1]=='Y': chromosome=24 else: chromosome=int(row[1]) # if chromosome==4: if True: if ndebug>Recordstoread: print("Warning: Sample file is not read to the end.") break if arraymap[row[0]]!='unused': ndebug+=1 allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])]) ifile.close() allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3)) array='' for row in allsegments: if row[0]!=array: array=row[0] newarray=[array,[row]] ControlSegments.append(newarray) else: ControlSegments[-1][1].append(row) ControlSegmentsBackup=copy.deepcopy(ControlSegments)#order: array, [array, chromosome, start, end], it is sorted cp=0 SliceA=[] SliceB=[] pt=list() #list of pointers to the currently working segment in ControlSegments, #Calculate G-scores for Controls ControlScores=list() #chromosome, start, end, p-value, G-score #remember this is for LOH print("Now Calculate G-Scores for Controls.") #print(len(ControlSegments)) for row in ControlSegments: pt.append(0)#initiate the pointers to 0 #Find the slice while True: #pre-set SliceA #print("main while") empty=True for index, row in enumerate(ControlSegments): if pt[index]<len(row[1]): ## print(row[1][pt[index]]) lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] if lengthofsegment<2000000:#ignore very long segments: SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array empty=False break #print(empty) if empty: break ## print("\nBefore refinement,SliceA"+str(SliceA)) #refine SliceA for index,row in enumerate(ControlSegments): if pt[index]<len(row[1]): lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2] ## print(lengthofsegment) if lengthofsegment>=2000000:#ignore very long segments # pt[index]+=1 continue SliceB=row[1][pt[index]][:] ## print("\nIn refinement,SliceA:"+str(SliceA)) ## print("In refinement, SliceB:"+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes pass elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes SliceA=SliceB[:] pass else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) if re=='equal': pass elif re=='no relationship': if segA[0]<segB[0]: #A is before B pass else: #A is after B SliceA=SliceB[:] elif re=='cover': if SliceA[2]<SliceB[2]: SliceA[3]=SliceB[2] elif SliceA[2]==SliceB[2]: SliceA=SliceB[:] else: print("This is impossible!") exit(388) elif re=='in': if SliceA[2]>SliceB[2]: SliceA[2]=segB[0] SliceA[3]=segA[0] elif SliceA[2]==SliceB[2]: pass else: print("This is impossible!") exit(554) elif re=='overlap': if SliceA[2]<SliceB[2]: #A before B SliceA[3]=SliceB[2] else: #A after B SliceA[3]=SliceA[2] SliceA[2]=SliceB[2] else: print(re) print("This is impossible!") exit(323) #use SliceA to go through all segments if cp%EchoPerLines==0: ## if True: print(str(cp)+": Now working on Controls. SliceA after refinery: "+str(SliceA)) cp+=1 SumControl=0 for index, row in enumerate(ControlSegments): if pt[index]<len(row[1]): SliceB=row[1][pt[index]]#segment to be tested if SliceB[2]==SliceB[3]:#remove 0 length segments from the que. pt[index]+=1 continue ## print("\nSliceA: "+str(SliceA)) ## print("SliceB: "+str(SliceB)) if SliceA[1]<SliceB[1]: #A is before B in different chromosomes ## print("A is before B in different chromosomes.") continue elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes pt[index]+=1 continue else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) ## print(re) if re=='equal': SumControl=SumControl+1 #1.0 is for LOH only because no amplitude information is used for LOH pt[index]+=1 #move the pointer forward by 1 elif re=='no relationship': if segA[0]<segB[0]: #A is before B continue else: #A is after B pt[index]+=1 continue elif re=='cover': print("This is impossible!") print("\nSliceA: "+str(SliceA)) print("SliceB: "+str(SliceB)) exit(991) elif re=='in': SumControl=SumControl+1 #1.0 is for LOH only because no amplitude information is used for LOH if SliceB[3]>SliceA[3]: SliceB[2]=SliceA[3] else: pt[index]+=1 elif re=='overlap': SumControl+=1 if SliceA[2]<SliceB[2]: SliceB[2]=SliceA[3] else: pt[index]+=1 else: print("This is impossible!") exit(324) SliceA.append(0.0) SliceA.append(SumControl)#append p value and frequency of LOH ## print("After summation,SliceA"+str(SliceA)) ControlScores.append(SliceA) ControlSegments=ControlSegmentsBackup # ControlSegments restored, segments are sorted if Controls not in SegmentsDict: SegmentsDict[Controls]=copy.deepcopy(ControlSegments) ScoresDict[Controls]=copy.deepcopy(ControlScores) #Merge SampleScores and ControlScores, calculate subtracted G-score, calculate p-value using Mcnemar's test nSamples=len(SampleSegments) nControls=len(ControlSegments) FinalScores=list() ## The 2x2 contigency table for Fisher's Exact test ## ## with CNA without CNA ## Sample a b ## Control c d print("\nNow merge Samples and Controls.") while len(SampleScores)>0 and len(ControlScores)>0: SliceA=SampleScores.pop(0) #order: array, chromosome, start, end, p-value, sum, just ignore array SliceB=ControlScores.pop(0) #The naming of SliceA and SliceB is for convenience if SliceA[1]<SliceB[1]: #A is before B in different chromosomes #def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap): region=SliceA[1:4] ## print(region) M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) ## print(M) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) logp=-math.log10(pvalue) ## if logp<0.000001: ## print("a="+str(a)+", b="+str(b)+", c="+str(c)+", d="+str(d)) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) ControlScores.insert(0,SliceB) #push back B elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes #This is not interesting, ignored SampleScores.insert(0,SliceA) #push back A else: #same chromosome Chrom=SliceA[1] segA=SliceA[2:4] segB=SliceB[2:4] re=RelationshipOf(segA,segB) if re=='equal' or re=='cover' or re=='in' or re=='overlap' : region=SliceA[1:4] ## print(region) M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) if pvalue<1e-10: pvalue=1e-10 logp=-math.log10(pvalue) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples-SliceB[5]*1.0/nControls newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) elif re=='no relationship': if segA[0]<segB[0]: #A is before B region=SliceA[1:4] ## print(region) M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) logp=-math.log10(pvalue) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples # print("a="+str(a)+", nSamples="+str(nSamples)+", x="+str(x)) newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) ControlScores.insert(0,SliceB) #push back B else: #A is after B #This is not interesting, ignored SampleScores.insert(0,SliceA) #push back A else: print("This is impossible!") exit(325) if len(SampleScores)>0: while len(SampleScores)>0 : SliceA=SampleScores.pop(0) #order: array, chromosome, start, end, p-value, sum, just ignore array region=SliceA[1:4] M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) logp=-math.log10(pvalue) newscore=SliceA[1:4] newscore.append(logp) x=SliceA[5]*1.0/nSamples # print("a="+str(a)+", nSamples="+str(nSamples)+", x="+str(x)) newscore.append(x) if x>GScoreThreshold and pvalue<0.05: FinalScores.append(newscore) ##print("\n\n\n") ##for debugrowtoprint in FinalScores: ## print(debugrowtoprint) #print(FinalScores) #Merge nearby peaks, critical parameter: GaptoMerge, MinimumPeakWidth print("Now merge nearby peaks.") MergedFinalScores=list() CurrentPeak=list() for row in FinalScores: if len(CurrentPeak)==0:#first peak CurrentPeak=row[:] else: if CurrentPeak[0]!=row[0]:#different chromosome MergedFinalScores.append(CurrentPeak)#save CurrentPeak CurrentPeak=row[:] #move on else: n=row[1]-CurrentPeak[2] if n<=GaptoMerge:#merge peaks CurrentPeak[2]=row[2] else:# do not merge MergedFinalScores.append(CurrentPeak)#save CurrentPeak CurrentPeak=row[:] if len(CurrentPeak)>0: MergedFinalScores.append(CurrentPeak)#save the last CurrentPeak print("Now remove peaks less then "+str(MinimumPeakWidth) +" bp.") FinalScores=list() #FinalScores will be restored here #print(len(MergedFinalScores)) for row in MergedFinalScores: print(row) if row[2]-row[1]>=MinimumPeakWidth: FinalScores.append(row) #recalculate GScore and P value #SampleSegments and ControlSegments, the order: array, [array, chromosome, start, end], segments are sorted print("Now recalculate the GScore and P value.") NewFinalScores=list() ofile=open(StatisticsOutput,'w',newline='') stat_writer=csv.writer(ofile, delimiter='\t') header=['chromosome','Start','End','kbp','a','b','c','d','pvalue'] stat_writer.writerow(header) for peak in FinalScores: #print(peak) region=peak[0:3] M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap) print(M) a=M[0] b=M[1] c=M[2] d=M[3] pvalue=mcnemar(a,b,c,d) logp=-math.log10(pvalue) if pvalue<pvaluelimit : #1.3 is p=0.05, 2 is for p=0.01, 3 for p=0.001 #c/d is the odds ratio of control peak[3]=logp peak[4]=x NewFinalScores.append(peak) #header=['chromosome','Start','End','kbp','a','b','c','d'] stat_writer.writerow([peak[0],peak[1],peak[2],(peak[2]-peak[1])/1000.0,a,b,c,d,float("%.2e"%pvalue)]) FinalScores=NewFinalScores#restore FinalScores ofile.close() #zero-fill the spaces print("Now zero fill.") ptrProbe=0 #initialize the pointer FinalScoresZeros=list() for row in FinalScores: # print("row in FinalScores : "+str(row)) zl=ZeroFillSpaces(ptrProbe,PreviousProbeOf([row[0],row[1]])) for z in zl: FinalScoresZeros.append(z) FinalScoresZeros.append(row) ptrProbe=NextProbeOf([row[0],row[2]]) #after looping the last loh, seal the "zero fill" #print("Now seal the zero fill...") zl=ZeroFillSpaces(ptrProbe,len(probes)-1) for z in zl: FinalScoresZeros.append(z) #log print("logging...") ofile=open(outputfile,'w',newline='') writer=csv.writer(ofile, delimiter='\t') header=['Type','Chromosome','Start','End','-log10(q-value)','G-score','average','amplitute','frequency'] writer.writerow(header) for row in FinalScoresZeros: #order: chromosome, start, end, p-value, sum if row[0]==23: row[0]='X' elif row[0]==24: row[0]='Y' row.insert(0,"Del") row.append(0.0) row.append(0.0) row.append(0.0) writer.writerow(row) ofile.close() print ("Analysis is finished successfully! ") endclock=time.clock() print("The program ends at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) print("%.1f"%(endclock-startclock)+" seconds used.")
Encontrar la diferencia