r - alignment of sequences -
i want pairwise alignment uniprot , pdb sequences. have input file containing uniprot , pdb ids this.
pdb id uniprot id 1dbh q07889 1e43 p00692 1f1s q53591 first, need read each line in input file 2) retrieve pdb , uniprot sequences pdb.fasta , uniprot.fasta files 3) alignment , calculate sequence identity.
usually, utilize next programme pairwise alignment , seq.identity calculation.
library("seqinr") seq1 <- "mdekrraqhneverrrrdkinnwivqlskiipdssmestksgqskggilskasdyiqelrqsnhr" seq2<- "mkgqqktaeteegtvqiqegavatgedptsvaiasiqsaatfpdpnvkyvfrtenggqvm" library(biostrings) globalalign<- pairwisealignment(seq1, seq2) pid(globalalign, type = "pid3") i need print output this
pdbid uniprotid seq.identity 1dbh q07889 99 1e43 p00692 80 1f1s q53591 56 how can alter above code ? help appreciated!
'
this code looking for:
class test(): def get_seq(self, pdb,fasta_file): # sequences bio.pdb.pdbparser import pdbparser bio import seqio aa = {'arg':'r','his':'h','lys':'k','asp':'d','glu':'e','ser':'s','thr':'t','asn':'n','gln':'q','cys':'c','sec':'u','gly':'g','pro':'p','ala':'a','ile':'i','leu':'l','met':'m','phe':'f','trp':'w','tyr':'y','val':'v'} p=pdbparser(permissive=1) structure_id="%s" % pdb[:-4] structure=p.get_structure(structure_id, pdb) residues = structure.get_residues() seq_pdb = '' res in residues: res = res.get_resname() if res in aa: seq_pdb = seq_pdb+aa[res] handle = open(fasta_file, "ru") record in seqio.parse(handle, "fasta") : seq_fasta = record.seq handle.close() self.seq_aln(seq_pdb,seq_fasta) def seq_aln(self,seq1,seq2): # align sequences bio import pairwise2 bio.subsmat import matrixinfo matlist matrix = matlist.blosum62 gap_open = -10 gap_extend = -0.5 alns = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend) top_aln = alns[0] aln_seq1, aln_seq2, score, begin, end = top_aln open('aln.fasta', 'w') outfile: outfile.write('> pdb_seq\n'+str(aln_seq1)+'\n> uniprot_seq\n'+str(aln_seq2)) print aln_seq1+'\n'+aln_seq2 self.seq_id('aln.fasta') def seq_id(self,aln_fasta): # sequence id import string bio import alignio input_handle = open("aln.fasta", "ru") alignment = alignio.read(input_handle, "fasta") j=0 # counts positions in first sequence i=0 # counts identity hits record in alignment: #print record amino_acid in record.seq: if amino_acid == '-': pass else: if amino_acid == alignment[0].seq[j]: += 1 j += 1 j = 0 seq = str(record.seq) gap_strip = seq.replace('-', '') percent = 100*i/len(gap_strip) print record.id+' '+str(percent) i=0 = test() a.get_seq('1dbh.pdb','q07889.fasta') this outputs:
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------eqtyydlvkaf-aeirqyirelnliikvfrepfvsnsklfsandvenifsrivdihelsvkllghiedtve-tdegsphplvgscfedlaeelafdpyesyardilrpgfhdrflsqlskpgaalylqsigegfkeavqyvlprlllapvyhclhyfellkqleeksedqedkeclkqaitallnvqsg-ekicskslakrrlsesa-------------aikk-neiqknidgwegkdigqccnefi-egtltrvgakherhiflfdgl-iccksnhgqprlpgasnaeyrlkekff-rkvqindkddtneykhafeiilkdensvifsaksaeeknnw-aalislqyrstl--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- mqaqqlpyeffseenapkwrgllvpalkkvqgqvhptlesnddalqyveelilqllnmlcqaqprsasdveervqksfphpidkwaiadaqsaiekrkrrnplslpvekihpllkevlgykidhqvsvyivavleyisadilklvgnyvrnirhyeitkqdikvamcadkvlmdmfhqdvedinilsltdeepstsgeqtyydlvkafmaeirqyirelnliikvfrepfvsnsklfsandvenifsrivdihelsvkllghiedtvemtdegsphplvgscfedlaeelafdpyesyardilrpgfhdrflsqlskpgaalylqsigegfkeavqyvlprlllapvyhclhyfellkqleeksedqedkeclkqaitallnvqsgmekicskslakrrlsesacrfysqqmkgkqlaikkmneiqknidgwegkdigqccnefimegtltrvgakherhiflfdglmiccksnhgqprlpgasnaeyrlkekffmrkvqindkddtneykhafeiilkdensvifsaksaeeknnwmaalislqyrstlermldvtmlqeekeeqmrlpsadvyrfaepdseeniifeenmqpkagipiikagtviklierltyhmyadpnfvrtflttyrsfckpqellsliierfeipepepteadriaiengdqplsaelkrfrkeyiqpvqlrvlnvcrhwvehhfydferdayllqrmeefigtvrgkamkkwvesitkiiqrkkiardngpghnitfqsspptvewhisrpghietfdlltlhpieiarqltllesdlyravqpselvgsvwtkedkeinspnllkmirhttnltlwfekcivetenleervavvsriieilqvfqelnnfngvlevvsamnsspvyrldhtfeqipsrqkkileeahelsedhykkylaklrsinppcvpffgiyltnilkteegnpevlkrhgkelinfskrrkvaeitgeiqqyqnqpyclrvesdikrffenlnpmgnsmekeftdylfnksleieprnpkplprfpkkysyplkspgvrpsnprpgtmrhptplqqeprkisysripesetestasapnsprtpltpppasgassttdvcsvfdsdhsspfhssndtvfiqvtlphgprsasvssisltkgtdevpvpppvpprrrpesapaesspskimskhldsppaipprqptskaysprysisdrtsisdppesppllpprepvrtpdvfsssplhlqppplgkksdhgnaffpnspspftppppqtpsphgtrrhlpsppltqevdlhsiagppvpprqstsqhipklppktykrehthpsmhrdgppllenahss pdb_seq 100 # pdb have 100% identity uniprot_seq 24 # pdb sequence has 24% identity uniprot sequence for work on input file, need set a.get_seq() in loop inputs text file.
edit:
replace seq_id function one:
def seq_id(self,aln_fasta): import string bio import alignio bio import seqio record_iterator = seqio.parse(aln_fasta, "fasta") first_record = record_iterator.next() print '%s has length of %d' % (first_record.id, len(str(first_record.seq).replace('-',''))) second_record = record_iterator.next() print '%s has length of %d' % (second_record.id, len(str(second_record.seq).replace('-',''))) lengths = [len(str(first_record.seq).replace('-','')), len(str(second_record.seq).replace('-',''))] if lengths.index(min(lengths)) == 0: # if both sequences have same length pdb sequence taken shortest print 'pdb sequence has shortest length' else: print 'uniport sequence has shortes length' idenities = 0 i,v in enumerate(first_record.seq): if v == '-': pass #print i,v, second_record.seq[i] if v == second_record.seq[i]: idenities +=1 #print i,v, second_record.seq[i], idenities print 'sequence idenity = %.2f percent' % (100.0*(idenities/min(lengths))) to pass arguments class use:
with open('input_file.txt', 'r') infile: next(infile) next(infile) # going input file line in infile: line = line.split() a.get_seq(segs[0]+'.pdb',segs[1]+'.fasta') r biopython bioperl
No comments:
Post a Comment