1  
 
'''
Created 2012

Contains various helper functions which read input or produce output.

@author: Sven Giese
'''
 10  import os 
 11  import random 
 12  import HTSeq 
 13  
 
 14  
 
 16      """
 
 17      Reads in the dna sequence of the given fasta
 
 18  
 
 19      @type  filename: string
 
 20      @param filename: Fasta-file used as input.
 
 21      @rtype:   HTSeq Sequence object
 
 22      @return:  Reference Fasta.
 
 23      """ 
 24      chr = HTSeq.FastaReader(filename) 
 25      for fasta in chr: 
 26          referenz = HTSeq.Sequence(fasta.seq,fasta.name) 
 27      return(referenz) 
  28  
 
 29  
 
 31      """
 
 32      Writes a given sequence object to a fasta file.
 
 33  
 
 34      @type  sequenceObject: HTSeq Sequence object
 
 35      @param sequenceObject: Reference sequence as fasta.
 
 36      """ 
 37      
 
 38      outfasta = open(filename,"w") 
 39      sequenceObject.write_to_fasta_file(outfasta) 
 40      outfasta.close() 
  41  
 
 42  
 
 44      """
 
 45      Creates the "delta" file for the comparison of the two chromosoms. This file contains the differences in nucleotide distribution between reference and artificial.
 
 46      input: nucleotid dictionary genom, aa dictionary genome, nucleotid dictionary artificial chromosom, aa dictionary, filename 
 
 47  
 
 48      @type  Ndic_G: dictionary
 
 49      @param Ndic_G: Nucleotid dictionary genom.
 
 50      @type  aadic_G: dictionary
 
 51      @param aadic_G: AA dictionary genome.
 
 52      @type  Ndic_AR: dictionary
 
 53      @param Ndic_AR: Nucleotid dictionary artificial.
 
 54      @type  aadic_AR: dictionary
 
 55      @param aadic_AR: AA dictionary artificial
 
 56      @type  filename: string
 
 57      @param filename: Output filename.
 
 58      """ 
 59      fobj = open(filename,"w") 
 60      fobj.write("NUC /AA \t Genom \t Artificial Reference \t Delta \n") 
 61     
 
 62      sum1 =0 
 63      sum2= 0 
 64      for item in Ndic_G.keys(): 
 65          fobj.write(item +"\t"+str(Ndic_G[item])+"\t"+str(Ndic_AR[item])+"\t"+str(Ndic_G[item]-Ndic_AR[item])+"\n") 
 66          sum1 +=abs(Ndic_G[item]-Ndic_AR[item]) 
 67      fobj.write(str(sum1)+"\n") 
 68      
 
 69      for item in aadic_G.keys(): 
 70          fobj.write(item +"\t"+str(aadic_G[item])+"\t"+str(aadic_AR[item])+"\t"+str(aadic_G[item]-aadic_AR[item])+"\n") 
 71          sum2 +=abs(aadic_G[item]-aadic_AR[item]) 
 72      fobj.write(str(sum2)+"\n") 
  73      
 
 74      
 
 75      
 
 76  
 
 78      """
 
 79      Writes the nucleotide distribution in a file and returns the dictionary. adjust s for % results.
 
 80      @type  seq: string
 
 81      @param seq: Nucleotide sequence.
 
 82      @type  txt_file: string
 
 83      @param txt_file: Output compare file.
 
 84      @type  shallwrite: Bool
 
 85      @param shallwrite: Decides if percentages values are written to the output.
 
 86      """ 
 87      Nndic={"A":0,"C":0,"G":0,"T":0,"N":0} 
 88      
 
 89      for i in range(0,len(seq)): 
 90            Nndic[seq[i]]+=1 
 91      s=len(seq) 
 92      s=1 
 93     
 
 94      if (shallwrite==1): 
 95          output_file=open(txt_file,'w') 
 96          for item in Nndic.keys(): 
 97              Nndic[item]=Nndic[item]/float(s) 
 98              output_file.write(item + "\t" + str(Nndic[item])+"\n") 
 99              
 
100          output_file.close() 
101      else: 
102           for item in Nndic.keys(): 
103              Nndic[item]=Nndic[item]/float(s) 
104      return (Nndic)     
 105                                                                                      
 
106  
 
107  
 
# NOTE(review): the "def" line was absent from the reviewed text; the function
# name and parameter order are reconstructed (parameters taken from the
# docstring and body) - confirm against callers before merging.
def aa_dist_seq(seq, txt_file, shallwrite):
    """
    Counts the amino acids of a sequence, optionally writes the distribution
    to a file and returns the dictionary. Adjust n for % results.

    Symbols that are not one of the 20 amino acid letters or the stop symbol
    "*" are skipped.

    @type  seq: string
    @param seq: Amino acid sequence (one-letter code).
    @type  txt_file: string
    @param txt_file: Output compare file (only used if shallwrite is set).
    @type  shallwrite: Bool
    @param shallwrite: Decides if the values are written to the output.
    @rtype:   dictionary
    @return:  Float value per amino acid letter and for "*".
    """
    aadic = {"A": 0, "R": 0, "N": 0, "D": 0, "C": 0, "E": 0, "Q": 0,
             "G": 0, "H": 0, "I": 0, "L": 0, "K": 0, "M": 0, "F": 0,
             "P": 0, "S": 0, "T": 0, "W": 0, "Y": 0, "V": 0, "*": 0}
    for residue in seq:
        # escape 'n' sequences: anything outside the table is ignored
        if residue in aadic:
            aadic[residue] += 1

    # Divisor for the reported values: 1 keeps absolute counts (as floats);
    # set it to len(seq) to report fractions instead.
    n = 1
    for item in aadic:
        aadic[item] = aadic[item] / float(n)

    if shallwrite == 1:
        # 'with' closes the file even if a write fails.
        with open(txt_file, 'w') as output_file:
            for item in aadic:
                output_file.write(item + "\t" + str(aadic[item]) + "\n")
    return aadic
 142  
 
# Legacy note kept from the original file:
# input: DNA sequence, output filename and 1/0 for writing/not writing the
# output file (the 1/0 flag presumably refers to the sequence-based helpers).
#
# NOTE(review): the "def" line was absent from the reviewed text; the function
# name and parameter order are reconstructed (parameters taken from the
# docstring and body) - confirm against callers before merging.
def nucleotide_dist_file(file_fasta, txt_file):
    """
    Reads a fasta file, determines its nucleotide distribution and appends
    the resulting dictionary (as text, one line) to the output file.

    @type  file_fasta: string
    @param file_fasta: Filename of the fasta file holding the DNA sequence.
    @type  txt_file: string
    @param txt_file: Filename for output (opened in append mode).
    """
    with open(file_fasta, 'r') as input_file:
        with open(txt_file, 'a') as output_file:
            # Header lines start with '>'; everything else is sequence.
            # join() avoids the quadratic cost of repeated '+=' on long inputs.
            seq = ''.join(line.rstrip() for line in input_file if line[0] != '>')
            # NOTE(review): the helper's body reads txt_file and shallwrite, so
            # the former one-argument call could not succeed. shallwrite=0 keeps
            # the helper from overwriting txt_file - confirm the helper signature.
            output_file.write(str(nucleotide_dist_seq(seq, txt_file, 0)))
            output_file.write('\n')
 166  
 
167  
 
# NOTE(review): the "def" line was absent from the reviewed text; the function
# name and parameter order are reconstructed (parameters taken from the
# docstring and body) - confirm against callers before merging.
def gethammingdistance(original, artificial):
    """
    Calculates the hamming distance (number of mismatches) between two
    sequences and prints it together with the average distance, i.e. the
    length of the original divided by the number of mismatches.

    Only the first len(original) positions are compared.

    @type  original: list
    @param original: Nucleotide sequence from the reference (orig sequence).
    @type  artificial: list
    @param artificial: Nucleotide sequence from the artificial reference
        (decoy sequence).
    @raise IndexError: If artificial is shorter than original.
    """
    hamming = 0
    for i, nucleotide in enumerate(original):
        if nucleotide != artificial[i]:
            hamming += 1

    # Without any mismatch the average distance is undefined; report infinity
    # instead of failing with a ZeroDivisionError.
    if hamming:
        avg_distance = len(original) / float(hamming)
    else:
        avg_distance = float("inf")

    print("#hamming distance REF-ART\t" + str(hamming))
    print("avg. distance:\t" + str(avg_distance))
    print("###########################\r\n")
 189