A very simple reader/writer for FASTA-encoded DNA sequences.
# -*- coding: utf-8 -*-
# Copyright 2006 INESC/ID
# Written by Luís Pedro Coelho
#
# Licence: MIT Licence:
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
"""Minimal reader and writer for FASTA-formatted sequences.

Every function accepts either an already-open file object or a filename;
filenames ending in '.gz' are read/written through gzip.
"""

# Types treated as "a filename" rather than "an open file object".
try:
    _string_types = (basestring,)  # noqa: F821 -- Python 2: str and unicode
except NameError:
    _string_types = (str,)


class fasta_sequence:
    """
    FASTA sequence with a header.

    Two members: header and seq
    """
    __slots__ = ['seq', 'header']

    def __init__(self, h, s):
        self.header = h
        self.seq = s

    def __repr__(self):
        return "fasta_sequence(%r, %r)" % (self.header, self.seq)


def _open_text(path, mode):
    """Open `path` for text I/O (`mode` is 'r' or 'w'), using gzip for '.gz'."""
    if path.endswith('.gz'):
        import gzip
        # gzip works on bytes by default.  Python 3 needs an explicit text
        # mode; Python 2 has none, but its str already is bytes.
        return gzip.open(path, mode + ('b' if str is bytes else 't'))
    return open(path, mode)


def _parse_fasta(lines):
    """Build the list of fasta_sequence objects from an iterable of lines."""
    results = []
    header = None  # stays None until the first '>' line has been seen
    seq_items = []
    for line in lines:
        if isinstance(line, bytes) and not isinstance(line, str):
            # Caller handed us a binary-mode handle on Python 3.
            line = line.decode('utf-8')
        # Strip only the line terminator: the last line of a file may have
        # none, and files written on Windows end lines with '\r\n'.
        line = line.rstrip('\r\n')
        if not line or line.startswith(';'):
            continue  # blank line or comment
        if line.startswith('>'):
            # Flush the previous record.  Data seen before any header becomes
            # its own record with an empty header instead of leaking into the
            # first named record.
            if header is not None or seq_items:
                results.append(fasta_sequence(header or '', "".join(seq_items)))
            header = line[1:]  # drop the '>'
            seq_items = []
        else:
            seq_items.append(line)
    # Flush the last record, even when its sequence is empty, so a trailing
    # header-only record is treated like one in the middle of the file.
    if header is not None or seq_items:
        results.append(fasta_sequence(header or '', "".join(seq_items)))
    return results


def fasta_read(input):
    """
    fasta_read(input):

    @param input can be either a file (any iterable of lines) or the name of
                 a file; names ending in '.gz' are decompressed on the fly.

    Returns a list of fasta_sequence objects with all the sequences in the
    file. Comments (lines starting with ';') and blank lines are ignored.

    The header is everything after the '>' with the line terminator removed;
    no other whitespace is stripped.  A file handle passed in by the caller is
    left open; a file opened here from a name is always closed.
    """
    opened_here = isinstance(input, _string_types)
    handle = _open_text(input, 'r') if opened_here else input
    try:
        return _parse_fasta(handle)
    finally:
        if opened_here:
            handle.close()


def _write_records(handle, s, line_width):
    """Write one fasta_sequence, or a (possibly nested) list of them."""
    if isinstance(s, list):
        for item in s:
            _write_records(handle, item, line_width)
        return
    handle.write("> %s\n" % s.header)
    seq = s.seq
    for start in range(0, len(seq), line_width):
        handle.write("%s\n" % seq[start:start + line_width])


def fasta_write(output, s, line_width=70):
    """
    fasta_write(output, sequence[s], line_width=70)

    @param output either a file (opened for writing) or a filename; names
                  ending in '.gz' are gzip-compressed
    @param s it can be either a fasta_sequence or a list of fasta_sequence
             objects
    @param line_width maximum number of sequence characters per output line

    Writes the sequence(s) into the file in FASTA Format.

    Header lines are written as '> <header>'.  Note that fasta_read only
    removes the '>', so a header gains one leading space on a write/read
    round trip.

    Raises ValueError if line_width is smaller than 1.  A file handle passed
    in by the caller is left open; a file opened here from a name is always
    closed (which also finalises gzip output).
    """
    if line_width < 1:
        raise ValueError("line_width must be a positive integer")
    opened_here = isinstance(output, _string_types)
    handle = _open_text(output, 'w') if opened_here else output
    try:
        _write_records(handle, s, line_width)
    finally:
        if opened_here:
            handle.close()


def rfasta_write(output, seqs):
    """
    Restricted FASTA

    This format (used by BioProspector) is just FASTA with the whole
    sequence on one line.

    @param output either a file (opened for writing) or a filename; names
                  ending in '.gz' are gzip-compressed, as in fasta_write
    @param seqs iterable of fasta_sequence objects

    A file handle passed in by the caller is left open; a file opened here
    from a name is always closed.
    """
    opened_here = isinstance(output, _string_types)
    handle = _open_text(output, 'w') if opened_here else output
    try:
        for s in seqs:
            handle.write("> %s\n" % s.header)
            handle.write("%s\n" % s.seq)
    finally:
        if opened_here:
            handle.close()