1  
 
  2  
 
  3  
 
  4  """
 
  5  FindOrfs, Created 2012
 
  6  
 
  7  Script to detect start and stop codon of open reading frames in a dna sequence. Note that the algorithm is only rudimentary and does not respect intron-exon structures.
 
  8  
 
  9  """ 
 10  
 
 11  
 
 13      """
 
 14      Get orf positions (forward/backward) and return them in a dictionary
 
 15  
 
 16      @type  sequence: string
 
 17      @param sequence: nucleotide sequence
 
 18      @type  file_ORF: string
 
 19      @param file_ORF: outputfile
 
 20      @type  pdic: dictionary
 
 21      @param pdic: Used to start start / end positions of ORFs.
 
 22      @rtype:   dictionary
 
 23      @return:  Stored start start / end positions of ORFs
 
 24      """ 
 25  
 
 26      
 
 27      START,STOP,STARTrev,STOPrev = find_orfs(sequence,pdic) 
 28      t=open(file_ORF,'a') 
 29      print "\t search forward orfs..." 
 30      orf_counter = 0 
 31      orf_name = "forw" 
 32      for i in START: 
 33          for j in STOP: 
 34              
 
 35              
 
 36              
 
 37              
 
 38              if i<j and (j+3-i)>149 and (j+3-i)<103050 and (j+3-i)%3==0: 
 39                  orf_counter+=1 
 40                  
 
 41                  t.write(">"+orf_name+" " + str(orf_counter)) 
 42                  t.write('\n') 
 43                  t.write(sequence[i:j+3]) 
 44                  t.write('\n') 
 45                  pdic[i]="S"  
 46                  pdic[j]="E"  
 47                  
 
 48      temp = orf_counter 
 49      
 
 50      orf_counter = 0 
 51      orf_name = "rev" 
 52      print "\t search backward orfs..." 
 53      for i in STARTrev: 
 54          for j in STOPrev: 
 55              
 
 56              
 
 57              
 
 58              
 
 59              
 
 60              if j<i and (i+3-j)>150 and (j+3-i)<103050 and (j+3-i)%3==0: 
 61                  
 
 62                  orf_counter+=1 
 63                  t.write(">"+orf_name+" " + str(orf_counter)) 
 64                  t.write('\n') 
 65                  t.write(sequence[j:i+3][::-1]) 
 66                  t.write('\n') 
 67                  pdic[i]="E" 
 68                  pdic[j]="S" 
 69      print(str(temp+orf_counter)) 
 70      t.close() 
 71      
 
 72      return (pdic) 
  73  
 
 74  
 
 76      """
 
 77      return the index of the first position of codon in the dna sequence  
 
 78  
 
 79      @type  posLastStop: int
 
 80      @param posLastStop: Position of the last found stop codon.
 
 81      @type  sequence: string
 
 82      @param sequence: Nucleotide sequence.
 
 83      @type  codon: string
 
 84      @param codon: 3-letter DNA code.
 
 85      @rtype:   int
 
 86      @return:  The position of the stop codon in the nucleotide sequence.
 
 87      """ 
 88      try: 
 89          return(sequence.index(codon,posLastStop)) 
 90              
 
 91      except: 
 92          return(-1) 
  93      
 
 95      """
 
 96      function to identify open reading frames in a dna sequence, careful: intron exon structures are not respected!  
 
 97  
 
 98      @type  genomeSequence: string
 
 99      @param genomeSequence: Nucleotide sequence.
 
100      @type  pdic: dictionary
 
101      @param pdic: Used to store start / end positions of ORFs.
 
102      @rtype:   dictionary
 
103      @return:  Found  start / end positions in the sequence consindering only the ORFs.
 
104      """ 
105      
 
106      
 
107      
 
108      
 
109   
 
110      start =[] 
111      stop =[] 
112      
 
113      posLastATG = 0 
114      posLastStop = 3 
115      orfList = [] 
116      
 
117      
 
118      print("\t..find forward orfs") 
119      while True: 
120          foundNew = False 
121          
 
122          try: 
123              
 
124                  start.append(genomeSequence.index("ATG",posLastATG)) 
125                  
 
126                  posLastATG = start[-1]+1 
127                  foundNew = True 
128          except: 
129              pass 
130          
 
131          stopSub =[] 
132          stopcodons =["TAA","TGA","TAG"] 
133          for item in stopcodons: 
134              stopSub.append(findstop_help(posLastStop, genomeSequence, item)) 
135  
 
136          stopSub.sort() 
137          
 
138          if(stopSub[0] > -1): 
139              stop.append(stopSub[0]) 
140              posLastStop = stop[-1]+1 
141              foundNew = True 
142              
 
143          elif(stopSub[1] > -1): 
144              stop.append(stopSub[1]) 
145              posLastStop = stop[-1]+1  
146              foundNew = True; 
147              
 
148          elif(stopSub[2] > -1): 
149              stop.append(stopSub[2]);  
150              posLastStop = stop[-1]+1  
151              foundNew = True; 
152          
 
153          if(foundNew): 
154              pass 
155          else: 
156              break 
157          
 
158      
 
159      
 
160      
 
161          
 
162      startRev = [] 
163      stopRev = [] 
164  
 
165      posLastCAT = 3 
166      posLastStop_rev = 0 
167         
 
168      print("\t..find reverse orfs") 
169      while True: 
170              foundNew_rev  = False 
171              
 
172              try: 
173                  
 
174                      startRev.append(genomeSequence.index("CAT",posLastCAT)) 
175                      
 
176                      posLastCAT = startRev[-1]+1 
177                      foundNew_rev = True 
178              except: 
179                  pass 
180              
 
181              stopSub =[] 
182              stopcodons =["TTA","TCA","CTA"] 
183              for item in stopcodons: 
184                  stopSub.append(findstop_help(posLastStop_rev, genomeSequence, item)) 
185      
 
186              stopSub.sort() 
187              
 
188              if(stopSub[0] > -1): 
189                  stopRev.append(stopSub[0]) 
190                  posLastStop_rev = stopRev[-1]+1 
191                  foundNew_rev = True 
192                  
 
193              elif(stopSub[1] > -1): 
194                  stopRev.append(stopSub[1]) 
195                  posLastStop_rev = stopRev[-1]+1 
196                  foundNew_rev = True 
197                  
 
198                  
 
199              elif(stopSub[2] > -1): 
200                  stopRev.append(stopSub[2]) 
201                  posLastStop_rev = stopRev[-1]+1 
202                  foundNew_rev = True 
203                  
 
204              
 
205              if(foundNew_rev): 
206                  pass 
207              else: 
208                  break 
209      
 
210      print("START codons : "  + str(len(start))) 
211      print("STOP codons : "  +str(len(stop))) 
212      print("revSTART codons : "  +str(len(startRev))) 
213      print("revSTOP codons : "  +str(len(stopRev))) 
214      
 
215      
 
216      
 
217      
 
218      
 
219      removeList=[] 
220      print("\t..creating forward orfs") 
221      for stopPos in stop: 
222          foundPartner=False 
223          startPos = 0 
224          i = 0 
225          startPos = start[0] 
226          
 
227          while ((i<len(start)-1) and (start[i] <stopPos)): 
228                  startPos = start[i] 
229                  i+=1 
230                  if((stopPos+3-startPos)% 3 == 0): 
231                      
 
232                      if((foundPartner == False) and (stopPos+3)-startPos >149): 
233                          
 
234                          foundPartner = True 
235                          pdic[stopPos+3]="E" 
236                          pdic[startPos]="S" 
237                          orfList.append(startPos) 
238                          orfList.append(stopPos+3) 
239                          removeList.append(startPos) 
240                      else: 
241                          removeList.append(startPos) 
242          for item in removeList: 
243              
 
244              start.remove(item) 
245          removeList =[] 
246          
 
247       
 
248      
 
249      print("\t..creating reverse orfs") 
250      removeList_rev=[] 
251      l = len(stopRev)-1 
252      
 
253      for r in range(l,-1,-1): 
254          stopPos = stopRev[r] 
255          foundPartner=False 
256          
 
257          
 
258          
 
259          i = len(startRev)-1 
260         
 
261          
 
262          while((i >=0) and (startRev[i]>stopPos)): 
263              startPos=startRev[i] 
264              i -=1 
265              if((startPos+3-stopPos)%3 == 0): 
266                  
 
267                  if((foundPartner != True) and (startPos+3-stopPos > 149)): 
268                      
 
269                      pdic[stopPos]="S" 
270                      pdic[startPos+3]="E" 
271                      foundPartner = True 
272                      orfList.append(startPos) 
273                      orfList.append(stopPos+3) 
274                      removeList_rev.append(startPos) 
275                  else: 
276                      removeList_rev.append(startPos) 
277                      
 
278          for item in removeList_rev: 
279          
 
280              startRev.remove(item) 
281          removeList_rev =[] 
282          
 
283      return(pdic) 
 284