1  
 
  2  """
 
  3  Created 2012
 
  4  core Script for the generation of the artificial reference genome
 
  5  
 
  6  The functions purpose is to to go through a list of positions and find balanced mutations
 
  7  which fulfill the demands on the artificial reference genome. Once a initial start positions is
 
  8  randomly selected all possible triplets with hamming distance 1 are generated and looked up 
 
  9  in a dictionary which contains all triplet positions in the input genome. If a suitable partner
 
 10  is found for the initial mutation the next start positions is chosen randomly. Else: try all other
 
 11  triplets with hamming distance 1 until no one is left. This process can be accelerated by allowing
 
 12  unbalanced mutations, but this will cause differences in the NUC/AA distribution and the AA neighborhood.
 
 13  
 
 14  
 
 15  @author: Sven Giese
 
 16  """ 
 17  
 
 18  import random as r 
 19  import Prep as INI 
 20  
 
 21  
 
 22  
 
 24      """
 
 25      Returns a random mutation for a given AA and its Codon(DNA). The mutation is done in a way which supports the equilibrium 
 
 26      of the nucleotide distribution by only regarding hamming distance=1 Codons as possible mutations 
 
 27  
 
 28      @type  AA: string
 
 29      @param AA: Single AA.
 
 30      @type  Codon: string
 
 31      @param Codon: 3-letter dna code.
 
 32      @rtype:   list,list
 
 33      @return:  A list of all valid mutations (triplet) and the coresponding AA.
 
 34      """ 
 35      temp_mutationlist = [] 
 36      '''create a list of possible triplets within hamming distance 1 ''' 
 37      for item in INI.genetic_code.keys(): 
 38          isvalid = INI.isvalidtriplet(item,Codon) 
 39          ''' Hamming distance 1, AA is not equal to the given AA,forbid mutation to stopcodon ''' 
 40          if (isvalid == True and AA !=INI.genetic_code[item] and INI.genetic_code[item]!="*"): 
 41              temp_mutationlist.append(item) 
 42      
 
 43      
 
 44      aalist = [] 
 45      
 
 46      for item in temp_mutationlist: 
 47          if (item in INI.genetic_code): 
 48              aalist.append(INI.genetic_code[item]) 
 49          else: 
 50              aalist.append("n") 
 51              
 
 52      return(temp_mutationlist,aalist) 
  53  
 
 54  
 
 56      """
 
 57      Given two triplets, returns the differences between them plus the position 
 
 58  
 
 59      @type  triplet_old: string
 
 60      @param triplet_old: AA triplet.
 
 61      @type  triplet_new: string
 
 62      @param triplet_new: AA triplet.
 
 63      @rtype:   Char,Char,int
 
 64      @return:  The new aminoacid, the old aminoacid and the position. 
 
 65      """ 
 66      for i in range(0,3): 
 67          if (triplet_new[i]!=triplet_old[i]): 
 68              
 
 69              return (triplet_new[i],triplet_old[i],i) 
  70          
 
 71        
 
 73      """
 
 74      Checks if a position is valid for mutation. It queries all neighboring positions (iprime +-distance) to check whether there already was a mutation in pdic
 
 75  
 
 76      @type  pdic: dictionary
 
 77      @param pdic: Diciontary containing mutations and start/ stop codons..
 
 78      @type  iprime: int
 
 79      @param iprime: Position of the prospective mutation (DNA level)
 
 80      @type  distance: int
 
 81      @param distance: User defined parameter which limits the distance between two mutations.
 
 82      @rtype:   Bool
 
 83      @return: Boolean which decides if the position is valid (1= yes,0 = no) 
 
 84      """ 
 85      
 
 86      
 
 87      distance = distance-2 
 88      
 
 89      istforbidden = 0 
 90      for o in range(-distance,distance+2,1): 
 91          if (iprime+o in pdic): 
 92              
 
 93              
 
 94              if((pdic[iprime+o]=="E") or (pdic[iprime+o]=="S")): 
 95                  if((o >3) or (o <-3)): 
 96                      pass 
 97                  else: 
 98                      istforbidden = 1 
 99                      break 
100              else: 
101                  istforbidden = 1 
102                  break 
103      else: 
104              pass 
105                  
 
106      return(istforbidden) 
 107  
 
108  
 
109  
 
110  
 
111 -def mutate_random(DNA,AminoAcid,distance,pdic,rev,header,Random,outputpath): 
 112          """
 
113          Mutates a given DNA(AminoAcid) Genomesequence on several positions (distance based on DISTANCE var. If one mutation is done
 
114          a compareable Triplet is searched to "reverse" the  changes made in AA distribution, N distribution, AA neighborhood
 
115      
 
116          @type  DNA: list
 
117          @param DNA: DNA sequence of the reference genome.
 
118          @type  AminoAcid: list
 
119          @param AminoAcid: AA sequence of the reference genome.
 
120          @type  rev: Bool
 
121          @param rev: Boolean which decides if unbalanced mutations are allowed (only initial mutation is performed)
 
122          @type  pdic: dictionary
 
123          @param pdic: Diciontary containing mutations and start/ stop codons..
 
124          @type  header: string
 
125          @param header: Header for the resulting artificial reference file (fasta format).
 
126          @type  Random: Bool
 
127          @param Random: Boolean for choosing on of the mutation modes (linear = 0,random = 1)
 
128          @type  distance: int
 
129          @param distance: User defined parameter which limits the distance between two mutations.
 
130          @rtype:   list
 
131          @return: Artificial reference genome sequence.
 
132          """ 
133          
 
134          start = []  
135          both = []   
136          fobj2= open(outputpath+header+"_CompleteLog.txt","a") 
137          fobj2.write("BalancedMutation"+"\t"+"NewAA" + "\t" + "OldAA"+"\t"+"NewAAPos"+"\t"+"OldAAPos" +"\t"+ "NewDNA"+"\t"+ "OldDNA"+ "\t"+"NewDNAPos"+"\t"+"OldDNAPos"+"\n") 
138          fobj2.close() 
139          
 
140          
 
141          
 
142          samplespace = [] 
143          for i in range (2,len(AminoAcid),distance/3): 
144              samplespace.append(i) 
145          
 
146          
 
147          
 
148          if (Random ==1): 
149              r.shuffle(samplespace) 
150          else: 
151              pass 
152        
 
153          dna_list = list(DNA) 
154          AminoAcid_list = list(AminoAcid) 
155         
 
156          '''the lookup dictionary for the aa triplets ''' 
157          lookup_dic = INI.createdic(AminoAcid) 
158  
 
159          
 
160          gotit=False 
161          
 
162          succ_counter = 0 
163          fail_counter = 0  
164          skip = 0 
165         
 
166          ''' Main loop over the AminoAcid''' 
167          for i in samplespace: 
168              ''' no triplet left --> break ''' 
169              if(i+2 >len(AminoAcid)): 
170                  print("\t(finished...exceeded length of AA)") 
171                  continue 
172              
 
173              ''' AA which is going to be mutated''' 
174              AA = AminoAcid_list[i] 
175          
 
176              '''index for dna : i*3 --> AminoAcid --> DNA
 
177              #not i*3+3 because i starts at AA 2 since we need a right and left neighbor''' 
178              iprime = i*3 
179              
 
180              '''AA and corresponding DNA triplet for the middle AA ''' 
181              AA_triplet= AminoAcid_list[i-1]+AminoAcid_list[i]+AminoAcid_list[i+1] 
182              DNA_triplet = DNA[iprime:iprime+3] 
183  
 
184              
 
185              mutationsliste,aaliste = getMutation(AA,DNA_triplet) 
186              
 
187              
 
188              
 
189              val = isvalidposition(pdic, iprime, distance) 
190              if (val ==1): 
191                  skip+=1 
192                  fobj2= open(outputpath+header+"_CompleteLog.txt","a") 
193                  fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(skipped)'"+"\n") 
194                  fobj2.close() 
195                  continue 
196                      
 
197              else: 
198                  pass 
199              
 
200  
 
201              for q,item in enumerate(mutationsliste): 
202                  
 
203                  if gotit==True: 
204                      break 
205                  else: 
206                      pass 
207                  
 
208                  ''' old and new variables for before/after the mutation ''' 
209                  new_triplet                 = mutationsliste[q] 
210                  new_AA                      = aaliste[q] 
211                  new_N,old_N,position        = getdifference(DNA_triplet,new_triplet) 
212                  new_AA_triplet              = AA_triplet[0]+new_AA+AA_triplet[2] 
213                  tempdic = pdic 
214                  tempdic[iprime+position]="M" 
215                  
 
216                  if (new_AA_triplet in lookup_dic): 
217                      '''templist--> contains all starting positions of the "new_AA_triplet" which we want to substitute back ''' 
218                      templist = lookup_dic[new_AA_triplet] 
219                      
 
220                      
 
221                      
 
222                      tempposition = [iprime+position,"M"] 
223                      for l in range(0,len(templist)): 
224                          posi = templist[l] 
225                          
 
226                          ''' suitable dna position found? ''' 
227                          if (new_triplet == dna_list[posi*3+3]+dna_list[posi*3+3+1]+dna_list[posi*3+3+2]): 
228                              val = isvalidposition(tempdic, posi*3+3+position, distance) 
229                              
 
230                              if (val ==1): 
231                                  skip+=1 
232                                  continue 
233                              else: 
234                                  pass 
235                              
 
236                              '''back substitution & do subs on 1st position''' 
237                              pdic[posi*3+3+position]="R" 
238                              dna_list[posi*3+3+position]= old_N 
239                              
 
240                              pdic[iprime+position]="M" 
241                              dna_list[iprime+position]= new_N 
242                              
 
243                              AminoAcid_list[i]= new_AA 
244                              AminoAcid_list[posi+1]= AA 
245                              
 
246                              gotit = True 
247                              succ_counter+=1 
248                              
 
249                              lookup_dic[new_AA_triplet].remove(posi) 
250                              
 
251                              '''writing the log file ''' 
252                              fobj= open(outputpath+header+"_CompleteLog.txt","a") 
253                              fobj.write(str(1)+"\t"+AA_triplet + "\t" + new_AA_triplet+"\t"+str(i)+"\t"+str(posi) +"\t"+ DNA_triplet+"\t"+ str(new_triplet)+ "\t"+str(iprime+position)+"\t"+str(posi*3+3+position)+"\n") 
254                              fobj.close() 
255                              
 
256                              
 
257                              start.append(iprime+position) 
258                              both.extend([iprime+position,posi*3+3+position]) 
259                              break 
260                  
 
261                  
 
262                  else: 
263                      continue 
264                   
 
265              
 
266              if (gotit==False): 
267                  fobj2= open(outputpath+header+"_CompleteLog.txt","a") 
268                  fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(tried)'"+"\n") 
269                  fobj2.close() 
270                  fail_counter+=1 
271                  
 
272                  if (rev==0): 
273                      pdic[iprime+position]="M" 
274                      dna_list[iprime+position]= new_N 
275                      AminoAcid_list[i]= new_AA 
276                      start.append(iprime+position) 
277                      both.extend([iprime+position])         
278              elif (gotit==True): 
279                      gotit = False 
280              
 
281          
 
282          print("\r\n########Some stats:########") 
283          print("DNA length:\t" + str(len(DNA))) 
284          print("max substitutions:\t" + str(len(DNA)/distance)) 
285          print("#Balanced Mutations:\t" + str(succ_counter)) 
286          
 
287          
 
288          return ("".join(dna_list)) 
 289