Compare text

Find the difference between two text files

Real-time editor

Hide unchanged lines

Disable line wrap

Layout

Diff precision

Syntax highlighting

Diffchecker Desktop The most secure way to run Diffchecker. Get the Diffchecker Desktop app: your diffs never leave your computer!Get Desktop

Untitled diff

Created 11 years agoDiff never expires

31 removals

Lines
Total
Removed

Characters
Total
Removed

To continue using this feature, upgrade to Diffchecker Pro View Pricing

606 lines

30 additions

Lines
Total
Added

Characters
Total
Added

To continue using this feature, upgrade to Diffchecker Pro View Pricing

604 lines

#LOH

probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt'

##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.Score.txt'

##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.Score.txt'

##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Loss\Germline.Loss.Score.txt'

##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Gain\Germline.Gain.Score.txt'

GScoreThreshold=0.01#minimum Gscore to be recorded

Recordstoread=10000000#for debug

EchoPerLines=1000#verbose

GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair

MinimumPeakWidth=2000#minimum Peak Width, unit base pair

pvaluelimit=0.01

#Global variables

probes=list()#from the probe file, will be filled and sorted

FirstMarkersonChromosomes=dict()#First markers of chromosomes

LastMarkersonChromosomes=dict() #Last markers of chromosomes

#Functions

def mcnemar(a,b,c,d):

"""

Input args:

a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable

Output:

pvalue of test.

"""

if b+c>0:

chi2testval = (abs(b-c) - 1)**2/ (b + c)

df = 1

pvalue = 1 - stats.chi2.cdf(chi2testval, df)

else:

pvalue=1.0

return pvalue

def AbeforeB(a,b):

#compare postions of two probes, a and b contain: [chromosome, base pair position]

if a[0]<b[0]:

return True

elif a[0]>b[0]:

return False

else: #case: in the same chromosome

if a[1]<b[1]:

return True

elif a[1]>b[1]:

return False

else:

return True #same position, return True here

def LocateProbe(probe):#probe has: chromosome number, start size

p1=0

p2=len(probes)-1

while True:

pcenter=int((p1+p2)/2)

#print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter))

if probes[pcenter]==probe:

return pcenter

elif probes[p1]==probe:

return p1

elif probes[p2]==probe:

return p2

elif AbeforeB(probes[pcenter],probe):#take the right half

if p1!=pcenter:

p1=pcenter

else:

print("cannot find probe")

print(str(probe))

exit(577)

else: #take the left half

if p2!=pcenter:

p2=pcenter

else:

print("cannot find probe")

print(str(probe))

exit(577)

def LeftTarget(probe):#probe has: chromosome number, start size

## print(probe)

p1=0

## print(len(gtarget_numbers))

while p1<len(gtarget_numbers):

if gtarget_numbers[p1][1]==probe[0]:#same chromosome

#print("goes here")

if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered

if p1==0:

return 0

else:

return p1-1

p1+=1

if p1==len(gtarget_numbers):

return -111111111111

def RightTarget(probe):#probe has: chromosome number, start size

# print(probe)

p1=1

while p1<len(gtarget_numbers):

## print(gtarget_numbers[-p1])

## print(p1)

## print(probe)

if gtarget_numbers[-p1][1]==probe[0]:#same chromosome

if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered

if p1==1:

return len(gtarget_numbers)-1

else:

return len(gtarget_numbers)-p1+1

p1+=1

if p1==len(gtarget_numbers):

return p1

def CountMarkers(nchr,nstart,nend):

## print("in countmarkers")

## print(nchr)

## print(nstart)

## print(nend)

return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1

def PreviousProbeOf(element):

#element is a list with chromosome and probe position, e.g.[2,242953866]

n=LocateProbe(element)-1

if n<0:

return 0

else:

return n

def NextProbeOf(element):

n= LocateProbe(element)+1

if n>len(probes)-1:

return len(probes)-1

else:

return n

def RelationshipOf(seg1,seg2):

## function RelationshipOf(seg1 as list, seg2 as list)

## compare two segments seg1 and seg2,

## seg1 and seg2 are two lists, each with the Min and Max values of the segment,

## Min should be the first value in the list and Max the second.

## return a string indicating the relationship between the two segments:

## "equal" : identical

## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap

## "in": seg1 is entirely within seg2

## "cover": seg1 is bigger than seg2 and completely contains seg2

## "overlap": seg1 partially overlaps with seg2

if seg1[0]==seg2[0] and seg1[1]==seg2[1]:

return "equal"

elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]:

return "in"

elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]:

return "cover"

if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) :

return "overlap"

else :

return "no relationship"

def ZeroFillSpaces(StartProbeIndex,EndProbeIndex):

#return a list of lists each is a properly formated segment with value of 0.0

#StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted

lists_of_zerofills=list()

#print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex))

if StartProbeIndex< 0 or StartProbeIndex>= len(probes):

print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex))

quit(767)

elif EndProbeIndex <0 or EndProbeIndex>=len(probes):

print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex))

quit(768)

elif StartProbeIndex>=EndProbeIndex:

return lists_of_zerofills

else:

pass

stChr=probes[StartProbeIndex][0]

stBase=probes[StartProbeIndex][1]

endChr=probes[EndProbeIndex][0]

endBase=probes[EndProbeIndex][1]

while stChr<=endChr:

if stChr==endChr:

if stBase<endBase:

n=0

lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0])

else:

break

else:

endMark=LastMarkersonChromosomes[stChr]

lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0])

#move on to next chromosome

stChr+=1

if stChr>24:

break

else:

stBase=FirstMarkersonChromosomes[stChr]

return lists_of_zerofills

def getCytobands(chromosome,startn,endn):

#note: chromosome is 1-22 and X or Y

try:

## cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu',

## database='hg19')

cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306',

database='ucsc_hg18')

except mysql.connector.Error as err:

print("Error occured during obtaining cytoband information from local database.")

if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:

print("Something is wrong with your user name or password")

elif err.errno == errorcode.ER_BAD_DB_ERROR:

print("Database does not exists")

else:

print(err)

cursor = cnx.cursor()

query = ("SELECT name"

" FROM cytoBand "

" WHERE chrom=\"chr"+str(chromosome)+"\" "

" and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")"

" ORDER BY name")

#print(query)

cursor.execute(query)

cytobands=''

for bandname in cursor:

cytobands=cytobands+''+bandname

cursor.close()

cnx.close()

return cytobands

def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap):

#region is a peak, as a list, in order: chromosome, start, end

peak=region

#first initiate codemap to 0

for x in codemap.keys():

codemap[x]=[0,0]

for Array in SampleSegments:

cumulativelength=0

for row in Array[1]:

if row[1]==peak[0]:#same chromosome

segA=[peak[1],peak[2]]

segB=[row[2],row[3]]

re=RelationshipOf(segA,segB)

if re=="no relationship":#do nothing "no relationship"

#print("goes here")

pass

elif re=="in" or re=="equal":

codemap[arraymap[Array[0]]][0]=1

break #this sample has been counted, so no need to check other segments in this sample

elif re=="cover":

sa=segA[1]-segA[0]

sb=segB[1]-segB[0]

if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"

codemap[arraymap[Array[0]]][0]=1

break #this sample has been counted, so no need to check other segments in this sample

else:

cumulativelength+=sb

#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))

elif re=="overlap":

sa=segA[1]-segA[0]

sb=segB[1]-segB[0]

if segA[0]<segB[0]: #left overlap

sb_overlap=segA[1]-segB[0]

sb_outside=segB[1]-segA[1]

else: #right overlap

sb_overlap=segB[1]-segA[0]

sb_outside=segA[0]-segA[0]

#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap

if cumulativelength+10*sb_overlap>sa: #test of substantial overlap

codemap[arraymap[Array[0]]][0]=1

break #this sample has been counted, so no need to check other segments in this sample

else:

cumulativelength+=sb_overlap

#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))

else:

print("Strange relationship: "+re)

exit(436)

for Array in ControlSegments:

cumulativelength=0

for row in Array[1]:

if row[1]==peak[0]:#same chromosome

segA=[peak[1],peak[2]]

segB=[row[2],row[3]]

re=RelationshipOf(segA,segB)

if re=="no relationship":#do nothing

pass

elif re=="in" or re=="equal":

codemap[arraymap[Array[0]]][1]=1

break #this Control has been counted, so no need to check other segments in this Control

elif re=="cover":

sa=segA[1]-segA[0]

sb=segB[1]-segB[0]

if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"

codemap[arraymap[Array[0]]][1]=1

break #this Control has been counted, so no need to check other segments in this Control

else:

cumulativelength+=sb

#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))

elif re=="overlap":

sa=segA[1]-segA[0]

sb=segB[1]-segB[0]

if segA[0]<segB[0]: #left overlap

sb_overlap=segA[1]-segB[0]

sb_outside=segB[1]-segA[1]

else: #right overlap

sb_overlap=segB[1]-segA[0]

sb_outside=segA[0]-segA[0]

#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap

if cumulativelength+10*sb_overlap>sa: #test of substantial overlap

codemap[arraymap[Array[0]]][1]=1

break #this Control has been counted, so no need to check other segments in this Control

else:

cumulativelength+=sb_overlap

#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))

else:

print("Strange relationship: "+re)

exit(436)

#calculate a,b,c,d

a=b=c=d=0

for x in codemap.keys():

if codemap[x][0]==1 and codemap[x][1]==1:# + +

a+=1

elif codemap[x][0]==1 and codemap[x][1]==0:# + -

b+=1

elif codemap[x][0]==0 and codemap[x][1]==1:# + -

c+=1

elif codemap[x][0]==0 and codemap[x][1]==0:# + -

d+=1

return [a,b,c,d]

# main execution starts here

import os

import os.path

import csv

import shutil

import time

import operator

import copy

import scipy

import math

from scipy import stats

import mysql.connector

from mysql.connector import errorcode

startclock=time.clock()

print ("\n"*5)

print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

# read probes

print("Now read probes.")

ifile = open(probefile, "r")

reader = csv.reader(ifile,delimiter='\t')

for row in reader:

try:

n=0

if row[1]=='X':

#print("probes on X found")

n=23

elif row[1]=='Y':

n=24

elif row[1]=='MT':#ignore MT

continue

else:

n=int(row[1])

#note: start size position has been added by 1 as below!!

probes.append([n,int(row[2])])

except:

print(row)

quit(963)

ifile.close()

probes=sorted(probes,key=operator.itemgetter(0,1))

#initiate FirstMarkers and LastMarkers on chromosomes

##FirstMarkersonChromosomes=dict()#First markers of chromosomes

##LastMarkersonChromosomes=dict() #Last markers of chromosomes

n=0

for index,p in enumerate(probes):

if p[0]!=n: #first occation of a chromosome

FirstMarkersonChromosomes[p[0]]=(p[1])

if n!=0: #not just start

LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1]

n=p[0]

LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list

SegmentsDict=dict()

ScoresDict=dict()

sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt'

for loopcontrol in range(1,4):

if loopcontrol ==1:

#FamilialTumor-FamilialGermline

SampleSize99=ControlSize99=42

Samples = r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor.Loss.centroless.seg'

Samples = r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor.Gain.centroless.seg'

Controls= r'C:\Users\ly091\Documents\result8\Loss\FamilialGermline.Loss.centroless.seg'

Controls= r'C:\Users\ly091\Documents\result8\Gain\FamilialGermline.Gain.centroless.seg'

outputfile=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.gistic'

outputfile=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.gistic'

StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.statistics.txt'

StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.statistics.txt'

patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt'

elif loopcontrol ==2:

#SporadicTumor - SporadicGermline

SampleSize99=ControlSize99=91

Samples = r'C:\Users\ly091\Documents\result8\Loss\sporadicTumor.Loss.centroless.seg'

Samples = r'C:\Users\ly091\Documents\result8\Gain\sporadicTumor.Gain.centroless.seg'

Controls= r'C:\Users\ly091\Documents\result8\Loss\sporadicGermline.Loss.centroless.seg'

Controls= r'C:\Users\ly091\Documents\result8\Gain\sporadicGermline.Gain.centroless.seg'

outputfile=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.gistic'

outputfile=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.gistic'

StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.statistics.txt'

StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.statistics.txt'

patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt'

elif loopcontrol ==3:

#Tumor-Germline

SampleSize99=ControlSize99=133

Samples = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.centroless.seg'

Samples = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.centroless.seg'

Controls= r'C:\Users\ly091\Documents\result8\Loss\germline.Loss.centroless.seg'

Controls= r'C:\Users\ly091\Documents\result8\Gain\germline.Gain.centroless.seg'

outputfile=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.gistic'

outputfile=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.gistic'

StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.statistics.txt'

StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.statistics.txt'

patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt'

else:

print("Error in control.")

exit(444)

print("Now build the code map and the array map...")

#build the code map

codemap=dict()#to store the sample information file

ifile = open(patientcodes, "r")

reader = csv.reader(ifile,delimiter='\t')

for row in reader:

if len(row)==0:

pass

else:

codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region

#0 is no, 1 is yes

ifile.close()

#build the array map

arraymap=dict()

ifile = open(sampleinformationfile, "r")

reader = csv.reader(ifile,delimiter='\t')

rownum = 0

for row in reader:

if rownum == 0:

rownum+=1

if row[17]!='Sample2':

print("File error!")

quit(883)

else:

sample2=row[17]

if sample2 in codemap:

arraymap[row[0].split('.')[0]]=sample2

else:

arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used.

ifile.close()

if Samples in SegmentsDict:

print("\n*** Use saved results for "+Samples +" ***\n")

SampleSegments=copy.deepcopy(SegmentsDict[Samples])

SampleScores=copy.deepcopy(ScoresDict[Samples])

else:

#read Samples:

print("Now read Samples.")

SampleSegments=list()

allsegments_unsorted=list()

ifile = open(Samples, "r")

reader = csv.reader(ifile,delimiter='\t')

rownum = 0

ndebug=0

for row in reader:

if rownum == 0:

rownum+=1

else:

chromosome=0

if row[1]=='X':

chromosome=23

elif row[1]=='Y':

chromosome=24

else:

chromosome=int(row[1])

# if chromosome==4:

if True:

if ndebug>Recordstoread:

print("Warning: Sample file is not read to the end.")

break

if arraymap[row[0]]!='unused':

ndebug+=1

allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])])

ifile.close()

allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3))

array=''

for row in allsegments:

if row[0]!=array:

array=row[0]

newarray=[array,[row]]

SampleSegments.append(newarray)#order: array, [array, chromosome, start, end]

else:

SampleSegments[-1][1].append(row)

SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted

cp=0

SliceA=[]

SliceB=[]

pt=list() #list of pointers to the currently working segment in SampleSegments,

#Calculate G-scores for Samples

SampleScores=list() #chromosome, start, end, p-value, G-score

#remember this is for LOH

print("Now Calculate G-Scores for Samples.")

#print(len(SampleSegments))

for row in SampleSegments:

pt.append(0)#initiate the pointers to 0

#Find the slice

while True:

#pre-set SliceA

#print("main while")

empty=True

for index, row in enumerate(SampleSegments):

if pt[index]<len(row[1]):

## print(row[1][pt[index]])

lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]

if lengthofsegment<2000000:#ignore very long segments:

SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array

empty=False

break

#print(empty)

if empty:

break

## print("\nBefore refinement,SliceA"+str(SliceA))

#refine SliceA

for index,row in enumerate(SampleSegments):

if pt[index]<len(row[1]):

lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]

## print(lengthofsegment)

if lengthofsegment>=2000000:#ignore very long segments

# pt[index]+=1

continue

SliceB=row[1][pt[index]][:]

## print("\nIn refinement,SliceA:"+str(SliceA))

## print("In refinement, SliceB:"+str(SliceB))

if SliceA[1]<SliceB[1]: #A is before B in different chromosomes

pass

elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes

SliceA=SliceB[:]

pass

else: #same chromosome

Chrom=SliceA[1]

segA=SliceA[2:4]

segB=SliceB[2:4]

re=RelationshipOf(segA,segB)

if re=='equal':

pass

elif re=='no relationship':

if segA[0]<segB[0]: #A is before B

pass

else: #A is after B

SliceA=SliceB[:]

elif re=='cover':

if SliceA[2]<SliceB[2]:

SliceA[3]=SliceB[2]

elif SliceA[2]==SliceB[2]:

SliceA=SliceB[:]

else:

print("This is impossible!")

exit(388)

elif re=='in':

if SliceA[2]>SliceB[2]:

SliceA[2]=segB[0]

SliceA[3]=segA[0]

elif SliceA[2]==SliceB[2]:

pass

else:

print("This is impossible!")

exit(554)

elif re=='overlap':

if SliceA[2]<SliceB[2]: #A before B

SliceA[3]=SliceB[2]

else: #A after B

SliceA[3]=SliceA[2]

SliceA[2]=SliceB[2]

else:

print(re)

print("This is impossible!")

exit(323)

#use SliceA to go through all segments

if cp%EchoPerLines==0:

## if True:

print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA))

cp+=1

SumSample=0

for index, row in enumerate(SampleSegments):

if pt[index]<len(row[1]):

SliceB=row[1][pt[index]]#segment to be tested

if SliceB[2]==SliceB[3]:#remove 0 length segments from the que.

pt[index]+=1

continue

## print("\nSliceA: "+str(SliceA))

## print("SliceB: "+str(SliceB))

if SliceA[1]<SliceB[1]: #A is before B in different chromosomes

## print("A is before B in different chromosomes.")

continue

elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes

pt[index]+=1

continue

else: #same chromosome

Chrom=Slice

Chrom=SliceA[1]

Saved diffs

Original text

Open file

#LOH
probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt'
##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.Score.txt'
##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Loss\Germline.Loss.Score.txt'

GScoreThreshold=0.01#minimum Gscore to be recorded
Recordstoread=10000000#for debug
EchoPerLines=1000#verbose
GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair
MinimumPeakWidth=2000#minimum Peak Width, unit base pair
pvaluelimit=0.01

#Global variables
probes=list()#from the probe file, will be filled and sorted
FirstMarkersonChromosomes=dict()#First markers of chromosomes
LastMarkersonChromosomes=dict() #Last markers of chromosomes

#Functions
def mcnemar(a,b,c,d):
    """
    Input args:
       a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable
    Output:
       pvalue of test.
    """
    if b+c>0:
        chi2testval = (abs(b-c) - 1)**2/ (b + c)
        df = 1
        pvalue = 1 - stats.chi2.cdf(chi2testval, df)
    else:
        pvalue=1.0
    return pvalue
def AbeforeB(a,b):
    #compare postions of two probes, a and b contain: [chromosome, base pair position]
    if a[0]<b[0]:
        return True
    elif a[0]>b[0]:
        return False
    else: #case: in the same chromosome
        if a[1]<b[1]:
            return True
        elif a[1]>b[1]:
            return False
        else:
            return True #same position, return True here
def LocateProbe(probe):#probe has: chromosome number, start size
    p1=0
    p2=len(probes)-1
    while True:
        pcenter=int((p1+p2)/2)
        #print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter))
        if probes[pcenter]==probe:
            return pcenter
        elif probes[p1]==probe:
            return p1
        elif probes[p2]==probe:
            return p2
        elif AbeforeB(probes[pcenter],probe):#take the right half
            if p1!=pcenter:
                p1=pcenter
            else:
                print("cannot find probe")
                print(str(probe))
                exit(577)
        else: #take the left half
            if p2!=pcenter:
                p2=pcenter
            else:
                print("cannot find probe")
                print(str(probe))
                exit(577)
    
def LeftTarget(probe):#probe has: chromosome number, start size
##    print(probe)
    p1=0
##    print(len(gtarget_numbers))
    while p1<len(gtarget_numbers):
        if gtarget_numbers[p1][1]==probe[0]:#same chromosome
            #print("goes here")
            if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered
                if p1==0:
                    return 0
                else:
                    return p1-1
        p1+=1
    if p1==len(gtarget_numbers):
        return -111111111111

def RightTarget(probe):#probe has: chromosome number, start size
#    print(probe)
    p1=1
    while p1<len(gtarget_numbers):
##        print(gtarget_numbers[-p1])
##        print(p1)
##        print(probe)
        if gtarget_numbers[-p1][1]==probe[0]:#same chromosome
            if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered
                if p1==1:
                    return len(gtarget_numbers)-1
                else:
                    return len(gtarget_numbers)-p1+1
        p1+=1
    if p1==len(gtarget_numbers):
        return p1
def CountMarkers(nchr,nstart,nend):
##    print("in countmarkers")
##    print(nchr)
##    print(nstart)
##    print(nend)
    return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1

def PreviousProbeOf(element):
    #element is a list with chromosome and probe position, e.g.[2,242953866]
    n=LocateProbe(element)-1
    if n<0:
        return 0
    else:
        return n
def NextProbeOf(element):
    n= LocateProbe(element)+1
    if n>len(probes)-1:
        return len(probes)-1
    else:
        return n

def RelationshipOf(seg1,seg2):
## function RelationshipOf(seg1 as list, seg2 as list)
## compare two segments seg1 and seg2,
## seg1 and seg2 are two lists, each with the Min and Max values of the segment,
## Min should be the first value in the list and Max the second.
## return a string indicating the relationship between the two segments:
## "equal" : identical
## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap
## "in": seg1 is entirely within seg2
## "cover": seg1 is bigger than seg2 and completely contains seg2
## "overlap": seg1 partially overlaps with seg2

if seg1[0]==seg2[0] and seg1[1]==seg2[1]:
        return "equal"
    elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]:
        return "in"
    elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]:
        return "cover"
    if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) :
        return "overlap"
    else :
        
        return "no relationship"

def ZeroFillSpaces(StartProbeIndex,EndProbeIndex):
    #return a list of lists each is a properly formated segment with value of 0.0
    #StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted
    lists_of_zerofills=list()
    #print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex))
    
    if StartProbeIndex< 0 or StartProbeIndex>= len(probes):
        print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex))
        quit(767)
    elif EndProbeIndex <0 or EndProbeIndex>=len(probes):
        print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex))
        quit(768)
    elif StartProbeIndex>=EndProbeIndex:
        return lists_of_zerofills
    else:
        pass
    stChr=probes[StartProbeIndex][0]
    stBase=probes[StartProbeIndex][1]
    endChr=probes[EndProbeIndex][0]
    endBase=probes[EndProbeIndex][1]
    while stChr<=endChr:
        if stChr==endChr:
            if stBase<endBase:
                n=0
                lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0])
            else:
                break
        else:
            endMark=LastMarkersonChromosomes[stChr]
            lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0])
        #move on to next chromosome
        stChr+=1
        if stChr>24:
            break
        else:
            stBase=FirstMarkersonChromosomes[stChr]
            
    return lists_of_zerofills
def getCytobands(chromosome,startn,endn):
    #note: chromosome is 1-22 and X or Y
    try:
    ##  cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu',
    ##                                database='hg19')
      cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306',
                                    database='ucsc_hg18')

except mysql.connector.Error as err:
      print("Error occured during obtaining cytoband information from local database.")
      if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
      elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exists")
      else:
        print(err)

cursor = cnx.cursor()
    query = ("SELECT name"
               " FROM cytoBand "
               " WHERE chrom=\"chr"+str(chromosome)+"\" "
               " and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")"
               " ORDER BY name")
      #print(query)
    cursor.execute(query)
    cytobands=''
    for bandname in cursor:
        cytobands=cytobands+''+bandname
    cursor.close()
    cnx.close()
    return cytobands

def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap):
    #region is a peak, as a list, in order: chromosome, start, end
    peak=region
    #first initiate codemap to 0
    for x in codemap.keys():
        codemap[x]=[0,0]

for Array in SampleSegments:
        cumulativelength=0
        for row in Array[1]:
            if row[1]==peak[0]:#same chromosome
                segA=[peak[1],peak[2]]
                segB=[row[2],row[3]]
                re=RelationshipOf(segA,segB)
                if re=="no relationship":#do nothing "no relationship"
                    #print("goes here")
                    pass
                elif re=="in" or re=="equal":
                     codemap[arraymap[Array[0]]][0]=1
                     break #this sample has been counted, so no need to check other segments in this sample
                elif re=="cover":
                    sa=segA[1]-segA[0]
                    sb=segB[1]-segB[0]
                    if 10*sb+cumulativelength>sa:  #set the limit how small segB can be compared to segA in "cover"
                         codemap[arraymap[Array[0]]][0]=1
                         break #this sample has been counted, so no need to check other segments in this sample
                    else:
                        cumulativelength+=sb
                        #print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
                elif re=="overlap":
                    sa=segA[1]-segA[0]
                    sb=segB[1]-segB[0]
                    if segA[0]<segB[0]: #left overlap
                        sb_overlap=segA[1]-segB[0]
                        sb_outside=segB[1]-segA[1]
                    else:       #right overlap
                        sb_overlap=segB[1]-segA[0]
                        sb_outside=segA[0]-segA[0]
                    #if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
                    if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
                         codemap[arraymap[Array[0]]][0]=1
                         break #this sample has been counted, so no need to check other segments in this sample
                    else:
                        cumulativelength+=sb_overlap
                        #print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
                else:
                    print("Strange relationship: "+re)
                    exit(436)
    for Array in ControlSegments:
        cumulativelength=0
        for row in Array[1]:
            if row[1]==peak[0]:#same chromosome
                segA=[peak[1],peak[2]]
                segB=[row[2],row[3]]
                re=RelationshipOf(segA,segB)
                if re=="no relationship":#do nothing
                    pass
                elif re=="in" or re=="equal":
                     codemap[arraymap[Array[0]]][1]=1
                     break #this Control has been counted, so no need to check other segments in this Control
                elif re=="cover":
                    sa=segA[1]-segA[0]
                    sb=segB[1]-segB[0]
                    if 10*sb+cumulativelength>sa:  #set the limit how small segB can be compared to segA in "cover"
                         codemap[arraymap[Array[0]]][1]=1
                         break #this Control has been counted, so no need to check other segments in this Control
                    else:
                        cumulativelength+=sb
                        #print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
                elif re=="overlap":
                    sa=segA[1]-segA[0]
                    sb=segB[1]-segB[0]
                    if segA[0]<segB[0]: #left overlap
                        sb_overlap=segA[1]-segB[0]
                        sb_outside=segB[1]-segA[1]
                    else:       #right overlap
                        sb_overlap=segB[1]-segA[0]
                        sb_outside=segA[0]-segA[0]
                    #if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
                    if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
                         codemap[arraymap[Array[0]]][1]=1
                         break #this Control has been counted, so no need to check other segments in this Control
                    else:
                        cumulativelength+=sb_overlap
                        #print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
                else:
                    print("Strange relationship: "+re)
                    exit(436)
    #calculate a,b,c,d
    a=b=c=d=0
    for x in codemap.keys():
        if codemap[x][0]==1 and codemap[x][1]==1:# + +
            a+=1
        elif codemap[x][0]==1 and codemap[x][1]==0:# + -
            b+=1
        elif codemap[x][0]==0 and codemap[x][1]==1:# + -
            c+=1
        elif codemap[x][0]==0 and codemap[x][1]==0:# + -
            d+=1
    return [a,b,c,d]

# main execution starts here

import os
import os.path
import csv
import shutil
import time
import operator
import copy
import scipy
import math
from scipy import stats
import mysql.connector
from mysql.connector import errorcode

startclock=time.clock()
print ("\n"*5)
print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

# read probes
print("Now read probes.")
ifile  = open(probefile, "r")
reader = csv.reader(ifile,delimiter='\t')
for row in reader:
    try:
        n=0
        if row[1]=='X':
            #print("probes on X found")
            n=23
        elif row[1]=='Y': 
            n=24
        elif row[1]=='MT':#ignore MT
            continue
        else:
            n=int(row[1])
        #note: start size position has been added by 1 as below!!
        probes.append([n,int(row[2])])
    except:
        print(row)
        quit(963)
ifile.close()
probes=sorted(probes,key=operator.itemgetter(0,1))

#initiate FirstMarkers and LastMarkers on chromosomes
##FirstMarkersonChromosomes=dict()#First markers of chromosomes
##LastMarkersonChromosomes=dict() #Last markers of chromosomes
n=0
for index,p in enumerate(probes):
    if p[0]!=n: #first occation of a chromosome
        FirstMarkersonChromosomes[p[0]]=(p[1])
        if n!=0: #not just start
            LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1]
        n=p[0]
LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list

SegmentsDict=dict()
ScoresDict=dict()

sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt'
for loopcontrol in range(1,4):
    if loopcontrol ==1:        
        #FamilialTumor-FamilialGermline
        SampleSize99=ControlSize99=42
        Samples = r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor.Loss.centroless.seg'
        Controls= r'C:\Users\ly091\Documents\result8\Loss\FamilialGermline.Loss.centroless.seg'
        outputfile=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.gistic'
        StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.statistics.txt'
        patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt'
    elif loopcontrol ==2:        
        #SporadicTumor - SporadicGermline
        SampleSize99=ControlSize99=91
        Samples = r'C:\Users\ly091\Documents\result8\Loss\sporadicTumor.Loss.centroless.seg'
        Controls= r'C:\Users\ly091\Documents\result8\Loss\sporadicGermline.Loss.centroless.seg'
        outputfile=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.gistic'
        StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.statistics.txt'
        patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt'
    elif loopcontrol ==3:        
        #Tumor-Germline
        SampleSize99=ControlSize99=133
        Samples = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.centroless.seg'
        Controls= r'C:\Users\ly091\Documents\result8\Loss\germline.Loss.centroless.seg'
        outputfile=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.gistic'
        StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.statistics.txt'
        patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt'
    else:
        print("Error in control.")
        exit(444)

print("Now build the code map and the array map...")
    #build the code map
    codemap=dict()#to store the sample information file
    ifile  = open(patientcodes, "r")
    reader = csv.reader(ifile,delimiter='\t')
    for row in reader:
        if len(row)==0:
            pass
        else:
            codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region
                            #0 is no, 1 is yes
    ifile.close()

#build the array map
    arraymap=dict()
    ifile  = open(sampleinformationfile, "r")
    reader = csv.reader(ifile,delimiter='\t')
    rownum = 0
    for row in reader:
        if rownum == 0:
            rownum+=1
            if row[17]!='Sample2':
                print("File error!")
                quit(883)
        else:
            sample2=row[17]
            if sample2 in codemap:
                arraymap[row[0].split('.')[0]]=sample2
            else:
                arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used.
    ifile.close()

if Samples in SegmentsDict:
        print("\n***  Use saved results for "+Samples +"  ***\n")
        SampleSegments=copy.deepcopy(SegmentsDict[Samples])
        SampleScores=copy.deepcopy(ScoresDict[Samples])
    else:
        #read Samples:
        print("Now read Samples.")
        SampleSegments=list()
        allsegments_unsorted=list()
        ifile  = open(Samples, "r")
        reader = csv.reader(ifile,delimiter='\t')
        rownum = 0
        ndebug=0
        for row in reader:
            if rownum == 0:
                rownum+=1
            else:
                chromosome=0
                if row[1]=='X':
                    chromosome=23
                elif row[1]=='Y':
                    chromosome=24
                else:
                    chromosome=int(row[1])
        #        if chromosome==4:
                if True:
                    if ndebug>Recordstoread:
                        print("Warning: Sample file is not read to the end.")
                        break
                    if arraymap[row[0]]!='unused':
                        ndebug+=1
                        allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])])
        ifile.close()
        allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3))
        array=''
        for row in allsegments:
            if row[0]!=array:
                array=row[0]
                newarray=[array,[row]]
                SampleSegments.append(newarray)#order: array, [array, chromosome, start, end]
            else:
                SampleSegments[-1][1].append(row)
        SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted
        cp=0
        SliceA=[]
        SliceB=[]
        pt=list() #list of pointers to the currently working segment in SampleSegments, 
        #Calculate G-scores for Samples
        SampleScores=list()  #chromosome, start, end, p-value, G-score
        #remember this is for LOH
        print("Now Calculate G-Scores for Samples.")

#print(len(SampleSegments))
        for row in SampleSegments:
            pt.append(0)#initiate the pointers to 0

#Find the slice
        while True:
            #pre-set SliceA
            #print("main while")
            empty=True
            for index, row in enumerate(SampleSegments):
                if pt[index]<len(row[1]):
        ##            print(row[1][pt[index]])
                    lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
                    if lengthofsegment<2000000:#ignore very long segments:
                        SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array
                        empty=False
                        break
            #print(empty)
            if empty:
                break
        ##    print("\nBefore refinement,SliceA"+str(SliceA))

#refine SliceA
            
            for index,row in enumerate(SampleSegments):
                if pt[index]<len(row[1]):
                    lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
        ##            print(lengthofsegment)
                    if lengthofsegment>=2000000:#ignore very long segments
                       # pt[index]+=1
                        continue
                    SliceB=row[1][pt[index]][:]
        ##            print("\nIn refinement,SliceA:"+str(SliceA))
        ##            print("In refinement, SliceB:"+str(SliceB))
                    if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
                        pass
                    elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
                        SliceA=SliceB[:]
                        pass
                    else: #same chromosome
                        Chrom=SliceA[1]
                        segA=SliceA[2:4]
                        segB=SliceB[2:4]
                        re=RelationshipOf(segA,segB)
                    
                        if re=='equal':
                            pass
                        elif re=='no relationship':
                            if segA[0]<segB[0]: #A is before B
                                pass
                            else: #A is after B
                                SliceA=SliceB[:]
                        elif re=='cover':
                            if SliceA[2]<SliceB[2]:
                                SliceA[3]=SliceB[2]
                            elif SliceA[2]==SliceB[2]:
                                SliceA=SliceB[:]
                            else:
                                print("This is impossible!")
                                exit(388)
                        elif re=='in':
                            if SliceA[2]>SliceB[2]:
                                SliceA[2]=segB[0]
                                SliceA[3]=segA[0]
                            elif SliceA[2]==SliceB[2]:
                                pass
                            else:
                                print("This is impossible!")
                                exit(554)
                        elif re=='overlap':
                            if SliceA[2]<SliceB[2]: #A before B
                                SliceA[3]=SliceB[2]
                            else: #A after B
                                SliceA[3]=SliceA[2]
                                SliceA[2]=SliceB[2]
                        else:
                            print(re)
                            print("This is impossible!")
                            exit(323)

#use SliceA to go through all segments
            if cp%EchoPerLines==0:
        ##    if True:
                print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA))
            cp+=1
            SumSample=0
            for index, row in enumerate(SampleSegments):
                if pt[index]<len(row[1]):
                    SliceB=row[1][pt[index]]#segment to be tested
                    if SliceB[2]==SliceB[3]:#remove 0 length segments from the que.
                        pt[index]+=1
                        continue
        ##            print("\nSliceA: "+str(SliceA))
        ##            print("SliceB: "+str(SliceB))
                    if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
        ##                print("A is before B in different chromosomes.")
                        continue
                    elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
                        pt[index]+=1
                        continue
                    else: #same chromosome
                        Chrom=SliceA[1]
                        segA=SliceA[2:4]
                        segB=SliceB[2:4]
                        re=RelationshipOf(segA,segB)
        ##                print(re)
                        if re=='equal':
                            SumSample=SumSample+1 #1.0 is for LOH only because no amplitude information is used for LOH
                            pt[index]+=1 #move the pointer forward by 1
                        elif re=='no relationship':
                            if segA[0]<segB[0]: #A is before B
                                continue
                            else: #A is after B
                                pt[index]+=1
                                continue
                        elif re=='cover':
                                print("This is impossible!")
                                print("\nSliceA: "+str(SliceA))
                                print("SliceB: "+str(SliceB))
                                exit(991)
                        elif re=='in':
                            SumSample=SumSample+1 #1.0 is for LOH only because no amplitude information is used for LOH
                            if SliceB[3]>SliceA[3]:
                                SliceB[2]=SliceA[3]
                            else:
                                pt[index]+=1
                        elif re=='overlap':
                            SumSample+=1
                            if SliceA[2]<SliceB[2]:
                                SliceB[2]=SliceA[3]
                            else:
                                pt[index]+=1
                        else:
                            print("This is impossible!")
                            exit(324)
            SliceA.append(0.0)
            SliceA.append(SumSample)#append p value and frequency of LOH
        ##    print("After summation,SliceA"+str(SliceA))
            SampleScores.append(SliceA)
        SampleSegments=SampleSegmentsBackup # SampleSegments restored, segments are sorted
        if Samples not in SegmentsDict:
            SegmentsDict[Samples]=copy.deepcopy(SampleSegments)
            ScoresDict[Samples]=copy.deepcopy(SampleScores)
    
    if Controls in SegmentsDict:
        print("\n***  Use saved results for "+Controls +"  ***\n")
        ControlSegments=copy.deepcopy(SegmentsDict[Controls])
        ControlScores=copy.deepcopy(ScoresDict[Controls])
    else:
        #read Controls:
        print("Now read Controls.")
        ControlSegments=list()
        allsegments_unsorted=list()
        ifile  = open(Controls, "r")
        reader = csv.reader(ifile,delimiter='\t')
        rownum = 0
        ndebug=0
        for row in reader:
            if rownum == 0:
                rownum+=1
            else:
                chromosome=0
                if row[1]=='X':
                    chromosome=23
                elif row[1]=='Y':
                    chromosome=24
                else:
                    chromosome=int(row[1])
        #        if chromosome==4:
                if True:
                    if ndebug>Recordstoread:
                        print("Warning: Sample file is not read to the end.")
                        break
                    if arraymap[row[0]]!='unused':
                        ndebug+=1
                        allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])])
        ifile.close()
        allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3))
        array=''
        for row in allsegments:
            if row[0]!=array:
                array=row[0]
                newarray=[array,[row]]
                ControlSegments.append(newarray)
            else:
                ControlSegments[-1][1].append(row)
        ControlSegmentsBackup=copy.deepcopy(ControlSegments)#order: array, [array, chromosome, start, end], it is sorted

cp=0
        SliceA=[]
        SliceB=[]
        pt=list() #list of pointers to the currently working segment in ControlSegments, 
        #Calculate G-scores for Controls
        ControlScores=list()  #chromosome, start, end, p-value, G-score
        #remember this is for LOH
        print("Now Calculate G-Scores for Controls.")

#print(len(ControlSegments))
        for row in ControlSegments:
            pt.append(0)#initiate the pointers to 0

#Find the slice
        while True:
            #pre-set SliceA
            #print("main while")
            empty=True
            for index, row in enumerate(ControlSegments):
                if pt[index]<len(row[1]):
        ##            print(row[1][pt[index]])
                    lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
                    if lengthofsegment<2000000:#ignore very long segments:
                        SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array
                        empty=False
                        break
            #print(empty)
            if empty:
                break
        ##    print("\nBefore refinement,SliceA"+str(SliceA))

#refine SliceA
            
            for index,row in enumerate(ControlSegments):
                if pt[index]<len(row[1]):
                    lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
        ##            print(lengthofsegment)
                    if lengthofsegment>=2000000:#ignore very long segments
                       # pt[index]+=1
                        continue
                    SliceB=row[1][pt[index]][:]
        ##            print("\nIn refinement,SliceA:"+str(SliceA))
        ##            print("In refinement, SliceB:"+str(SliceB))
                    if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
                        pass
                    elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
                        SliceA=SliceB[:]
                        pass
                    else: #same chromosome
                        Chrom=SliceA[1]
                        segA=SliceA[2:4]
                        segB=SliceB[2:4]
                        re=RelationshipOf(segA,segB)
                    
                        if re=='equal':
                            pass
                        elif re=='no relationship':
                            if segA[0]<segB[0]: #A is before B
                                pass
                            else: #A is after B
                                SliceA=SliceB[:]
                        elif re=='cover':
                            if SliceA[2]<SliceB[2]:
                                SliceA[3]=SliceB[2]
                            elif SliceA[2]==SliceB[2]:
                                SliceA=SliceB[:]
                            else:
                                print("This is impossible!")
                                exit(388)
                        elif re=='in':
                            if SliceA[2]>SliceB[2]:
                                SliceA[2]=segB[0]
                                SliceA[3]=segA[0]
                            elif SliceA[2]==SliceB[2]:
                                pass
                            else:
                                print("This is impossible!")
                                exit(554)
                        elif re=='overlap':
                            if SliceA[2]<SliceB[2]: #A before B
                                SliceA[3]=SliceB[2]
                            else: #A after B
                                SliceA[3]=SliceA[2]
                                SliceA[2]=SliceB[2]
                        else:
                            print(re)
                            print("This is impossible!")
                            exit(323)

#use SliceA to go through all segments
            if cp%EchoPerLines==0:
        ##    if True:
                print(str(cp)+": Now working on Controls. SliceA after refinery: "+str(SliceA))
            cp+=1
            SumControl=0
            for index, row in enumerate(ControlSegments):
                if pt[index]<len(row[1]):
                    SliceB=row[1][pt[index]]#segment to be tested
                    if SliceB[2]==SliceB[3]:#remove 0 length segments from the que.
                        pt[index]+=1
                        continue
        ##            print("\nSliceA: "+str(SliceA))
        ##            print("SliceB: "+str(SliceB))
                    if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
        ##                print("A is before B in different chromosomes.")
                        continue
                    elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
                        pt[index]+=1
                        continue
                    else: #same chromosome
                        Chrom=SliceA[1]
                        segA=SliceA[2:4]
                        segB=SliceB[2:4]
                        re=RelationshipOf(segA,segB)
        ##                print(re)
                        if re=='equal':
                            SumControl=SumControl+1 #1.0 is for LOH only because no amplitude information is used for LOH
                            pt[index]+=1 #move the pointer forward by 1
                        elif re=='no relationship':
                            if segA[0]<segB[0]: #A is before B
                                continue
                            else: #A is after B
                                pt[index]+=1
                                continue
                        elif re=='cover':
                                print("This is impossible!")
                                print("\nSliceA: "+str(SliceA))
                                print("SliceB: "+str(SliceB))
                                exit(991)
                        elif re=='in':
                            SumControl=SumControl+1 #1.0 is for LOH only because no amplitude information is used for LOH
                            if SliceB[3]>SliceA[3]:
                                SliceB[2]=SliceA[3]
                            else:
                                pt[index]+=1
                        elif re=='overlap':
                            SumControl+=1
                            if SliceA[2]<SliceB[2]:
                                SliceB[2]=SliceA[3]
                            else:
                                pt[index]+=1
                        else:
                            print("This is impossible!")
                            exit(324)
            SliceA.append(0.0)
            SliceA.append(SumControl)#append p value and frequency of LOH
        ##    print("After summation,SliceA"+str(SliceA))
            ControlScores.append(SliceA)

ControlSegments=ControlSegmentsBackup # ControlSegments restored, segments are sorted
        if Controls not in SegmentsDict:
            SegmentsDict[Controls]=copy.deepcopy(ControlSegments)
            ScoresDict[Controls]=copy.deepcopy(ControlScores)

#Merge SampleScores and ControlScores, calculate subtracted G-score, calculate p-value using Mcnemar's test

nSamples=len(SampleSegments)
    nControls=len(ControlSegments)
    FinalScores=list()

##  The 2x2 contigency table for Fisher's Exact test
    ##
    ##              with CNA        without CNA
    ##  Sample      a               b
    ##  Control     c               d

print("\nNow merge Samples and Controls.")
    while len(SampleScores)>0 and len(ControlScores)>0:
        SliceA=SampleScores.pop(0) #order: array, chromosome, start, end, p-value, sum, just ignore array
        SliceB=ControlScores.pop(0) #The naming of SliceA and SliceB is for convenience
        if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
#def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap):
            region=SliceA[1:4]
##            print(region)
            M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap)
##            print(M)
            a=M[0]
            b=M[1]
            c=M[2]
            d=M[3]
            pvalue=mcnemar(a,b,c,d)
            logp=-math.log10(pvalue)
    ##        if logp<0.000001:
    ##            print("a="+str(a)+", b="+str(b)+", c="+str(c)+", d="+str(d))
            
            newscore=SliceA[1:4]
            newscore.append(logp)
            x=SliceA[5]*1.0/nSamples
            newscore.append(x)
            if x>GScoreThreshold and pvalue<0.05:
                FinalScores.append(newscore)
            ControlScores.insert(0,SliceB) #push back B
        elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
            #This is not interesting, ignored
            SampleScores.insert(0,SliceA) #push back A
        else: #same chromosome
            Chrom=SliceA[1]
            segA=SliceA[2:4]
            segB=SliceB[2:4]
            re=RelationshipOf(segA,segB)
            if re=='equal' or re=='cover' or re=='in' or re=='overlap' :
                
                region=SliceA[1:4]
##                print(region)
                M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap)
                a=M[0]
                b=M[1]
                c=M[2]
                d=M[3]
                pvalue=mcnemar(a,b,c,d)
                if pvalue<1e-10:
                    pvalue=1e-10
                logp=-math.log10(pvalue)
                newscore=SliceA[1:4]
                newscore.append(logp)
                x=SliceA[5]*1.0/nSamples-SliceB[5]*1.0/nControls
                newscore.append(x)
                if x>GScoreThreshold and pvalue<0.05:
                    FinalScores.append(newscore)
            elif re=='no relationship':
                if segA[0]<segB[0]: #A is before B
                    
                    region=SliceA[1:4]
##                    print(region)
                    M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap)
                    a=M[0]
                    b=M[1]
                    c=M[2]
                    d=M[3]
                    pvalue=mcnemar(a,b,c,d)
                    logp=-math.log10(pvalue)
                    newscore=SliceA[1:4]
                    newscore.append(logp)
                    x=SliceA[5]*1.0/nSamples
    #                print("a="+str(a)+", nSamples="+str(nSamples)+", x="+str(x))
                    newscore.append(x)
                    if x>GScoreThreshold and pvalue<0.05:
                        FinalScores.append(newscore)
                    ControlScores.insert(0,SliceB) #push back B
                else: #A is after B
                    #This is not interesting, ignored
                    SampleScores.insert(0,SliceA) #push back A
            else:
                print("This is impossible!")
                exit(325)
    if len(SampleScores)>0:
        while len(SampleScores)>0 :
            SliceA=SampleScores.pop(0) #order: array, chromosome, start, end, p-value, sum, just ignore array
            region=SliceA[1:4]
            M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap)
            a=M[0]
            b=M[1]
            c=M[2]
            d=M[3]
            pvalue=mcnemar(a,b,c,d)
            logp=-math.log10(pvalue)
            newscore=SliceA[1:4]
            newscore.append(logp)
            x=SliceA[5]*1.0/nSamples
    #        print("a="+str(a)+", nSamples="+str(nSamples)+", x="+str(x))
            newscore.append(x)
            if x>GScoreThreshold and pvalue<0.05:
                FinalScores.append(newscore)
    ##print("\n\n\n")
    ##for debugrowtoprint in FinalScores:
    ##    print(debugrowtoprint)

#print(FinalScores)
    #Merge nearby peaks, critical parameter: GaptoMerge, MinimumPeakWidth
    print("Now merge nearby peaks.")
    MergedFinalScores=list()
    CurrentPeak=list()
    for row in FinalScores:
        if len(CurrentPeak)==0:#first peak
            CurrentPeak=row[:]
        else:
            if CurrentPeak[0]!=row[0]:#different chromosome
                MergedFinalScores.append(CurrentPeak)#save CurrentPeak
                CurrentPeak=row[:] #move on
            else:
                n=row[1]-CurrentPeak[2]
                if n<=GaptoMerge:#merge peaks
                    CurrentPeak[2]=row[2]
                else:# do not merge
                    MergedFinalScores.append(CurrentPeak)#save CurrentPeak
                    CurrentPeak=row[:]
    if len(CurrentPeak)>0:
        MergedFinalScores.append(CurrentPeak)#save the last CurrentPeak

print("Now remove peaks less then "+str(MinimumPeakWidth) +" bp.")
    FinalScores=list() #FinalScores will be restored here
    #print(len(MergedFinalScores))
    for row in MergedFinalScores:
        print(row)
        if row[2]-row[1]>=MinimumPeakWidth:
            FinalScores.append(row)

#recalculate GScore and P value
     #SampleSegments and ControlSegments, the order: array, [array, chromosome, start, end], segments are sorted

print("Now recalculate the GScore and P value.")
    NewFinalScores=list()

ofile=open(StatisticsOutput,'w',newline='')
    stat_writer=csv.writer(ofile, delimiter='\t')
    header=['chromosome','Start','End','kbp','a','b','c','d','pvalue']
    stat_writer.writerow(header)

for peak in FinalScores:
        #print(peak)
        region=peak[0:3]
        M=getmatrix(region, SampleSegments,ControlSegments,codemap, arraymap)
        print(M)
        a=M[0]
        b=M[1]
        c=M[2]
        d=M[3]
        pvalue=mcnemar(a,b,c,d)
        if pvalue<1e-10:
            print('Warnin: extremely small pvalue found',pvalue,'assigned 1e-10')
            pvalue=1e-10
        logp=-math.log10(pvalue)
        
        if pvalue<pvaluelimit : #1.3 is p=0.05, 2 is for p=0.01, 3 for p=0.001
            #c/d is the odds ratio of control
            peak[3]=logp
            peak[4]=x
            NewFinalScores.append(peak)
            #header=['chromosome','Start','End','kbp','a','b','c','d']
            stat_writer.writerow([peak[0],peak[1],peak[2],(peak[2]-peak[1])/1000.0,a,b,c,d,float("%.2e"%pvalue)])
    FinalScores=NewFinalScores#restore FinalScores

ofile.close()
                
    #zero-fill the spaces
    print("Now zero fill.")
    ptrProbe=0 #initialize the pointer
    FinalScoresZeros=list()
    for row in FinalScores:
     #   print("row in FinalScores : "+str(row))
        zl=ZeroFillSpaces(ptrProbe,PreviousProbeOf([row[0],row[1]]))
        for z in zl:
            FinalScoresZeros.append(z)
        FinalScoresZeros.append(row)
        ptrProbe=NextProbeOf([row[0],row[2]])
    #after looping the last loh, seal the "zero fill"
    #print("Now seal the zero fill...")
    zl=ZeroFillSpaces(ptrProbe,len(probes)-1)
    for z in zl:
        FinalScoresZeros.append(z)

#log

print("logging...")
    ofile=open(outputfile,'w',newline='')
    writer=csv.writer(ofile, delimiter='\t')
    header=['Type','Chromosome','Start','End','-log10(q-value)','G-score','average','amplitute','frequency']
    writer.writerow(header)
    for row in FinalScoresZeros: #order: chromosome, start, end, p-value, sum
        if row[0]==23:
            row[0]='X'
        elif row[0]==24:
            row[0]='Y'
        row.insert(0,"Del")
        row.append(0.0)
        row.append(0.0)
        row.append(0.0)
        writer.writerow(row)
    ofile.close()

print ("Analysis is finished successfully! ")
endclock=time.clock()
print("The program ends at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
print("%.1f"%(endclock-startclock)+" seconds used.")

Changed text

Open file

#LOH
probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt'
##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.Score.txt'
##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Gain\Germline.Gain.Score.txt'

# main execution starts here

import os
import os.path
import csv
import shutil
import time
import operator
import copy
import scipy
import math
from scipy import stats
import mysql.connector
from mysql.connector import errorcode

startclock=time.clock()
print ("\n"*5)
print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

SegmentsDict=dict()
ScoresDict=dict()

sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt'
for loopcontrol in range(1,4):
    if loopcontrol ==1:        
        #FamilialTumor-FamilialGermline
        SampleSize99=ControlSize99=42
        Samples = r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor.Gain.centroless.seg'
        Controls= r'C:\Users\ly091\Documents\result8\Gain\FamilialGermline.Gain.centroless.seg'
        outputfile=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.gistic'
        StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.statistics.txt'
        patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt'
    elif loopcontrol ==2:        
        #SporadicTumor - SporadicGermline
        SampleSize99=ControlSize99=91
        Samples = r'C:\Users\ly091\Documents\result8\Gain\sporadicTumor.Gain.centroless.seg'
        Controls= r'C:\Users\ly091\Documents\result8\Gain\sporadicGermline.Gain.centroless.seg'
        outputfile=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.gistic'
        StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.statistics.txt'
        patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt'
    elif loopcontrol ==3:        
        #Tumor-Germline
        SampleSize99=ControlSize99=133
        Samples = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.centroless.seg'
        Controls= r'C:\Users\ly091\Documents\result8\Gain\germline.Gain.centroless.seg'
        outputfile=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.gistic'
        StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.statistics.txt'
        patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt'
    else:
        print("Error in control.")
        exit(444)