FASTA Reader in Python

A very simple reader/writer for FASTA-encoded DNA sequences.

# -*- coding: utf-8 -*-
# Copyright (c) 2006 INESC/ID
# Written by Luís Pedro Coelho 
#
# Licence: MIT Licence
#
#   Permission is hereby granted, free of charge, to any person obtaining a copy of
#   this software and associated documentation files (the "Software"), to deal in
#   the Software without restriction, including without limitation the rights to
#   use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
#   of the Software, and to permit persons to whom the Software is furnished to do
#   so, subject to the following conditions:
#   
#   The above copyright notice and this permission notice shall be included in all
#   copies or substantial portions of the Software.
#
#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
#   DEALINGS IN THE SOFTWARE.

class fasta_sequence:
    """
        FASTA Sequence with a header

        Two members: header and seq

        @param h the header text of the record
        @param s the sequence data of the record
    """
    __slots__ = [ 'seq', 'header']
    def __init__(self, h, s):
        self.header=h
        self.seq=s

    def __repr__(self):
        # Show both members so records are readable in logs and debuggers.
        return "fasta_sequence(%r, %r)" % (self.header, self.seq)


def fasta_read(input):
    """
        fasta_read(input):

        @param input can be either a file (any iterable of lines) or the name
                     of a file. A name ending in '.gz' is opened with gzip.

        Returns a list of fasta_sequence objects with all the sequences in the file.
        Comments (lines starting with ';') and blank lines are ignored.

        The header of each record is the text after '>' with the line ending
        removed; any other whitespace is kept as-is. Sequence lines are
        concatenated with their line endings ('\\n' or '\\r\\n') removed.
        Lines delivered as bytes (e.g. from gzip) are decoded as UTF-8.

        A header without any sequence lines yields a record with an empty
        sequence. Sequence lines that appear before the first header are
        returned as a record with an empty header.

        Files opened by this function are closed before it returns; a file
        object passed in by the caller is left open.
    """
    opened_here = isinstance(input, str)
    if opened_here:
        if input.endswith('.gz'):
            import gzip
            input = gzip.GzipFile(input)
        else:
            input = open(input)
    results = []
    header = ''
    seq_items = []
    started = False  # True once a '>' line has been seen
    try:
        for line in input:
            # On Python 3 binary sources yield bytes; on Python 2 bytes is
            # str, so this branch is never taken there.
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode('utf-8')
            # Strip only the line terminator, so a final line without a
            # trailing newline does not lose its last character.
            line = line.rstrip('\r\n')
            if not line or line.startswith(';'):
                continue  # blank line or comment
            if line.startswith('>'):
                if started or seq_items:
                    results.append(fasta_sequence(header, "".join(seq_items)))
                    seq_items = []
                header = line[1:]  # Eat '>'
                started = True
            else:
                seq_items.append(line)
    finally:
        if opened_here:
            input.close()
    # Flush the last record, including a trailing header with no sequence.
    if started or seq_items:
        results.append(fasta_sequence(header, "".join(seq_items)))
    return results


def fasta_write(output, s, line_width=70):
    """
        fasta_write(output, sequence[s])

        @param output either a file (opened for writing) or a filename.
                      A filename ending in '.gz' is written through gzip.
        @param s it can be either a fasta_sequence or a list (or tuple) of
                 fasta_sequence objects; any object with 'header' and 'seq'
                 members is accepted
        @param line_width maximum number of sequence characters per line
                          (default 70); must be at least 1

        Writes the sequence(s) into the file in FASTA Format

        Raises ValueError if line_width is smaller than 1.

        A file opened by this function is closed before it returns; a file
        object passed in by the caller is left open.
    """
    if line_width < 1:
        # A zero step would make range() fail and a negative one would
        # silently write no sequence data at all.
        raise ValueError("line_width must be at least 1")
    opened_here = isinstance(output, str)
    if opened_here:
        if output.endswith('.gz'):
            import gzip
            if str is bytes:
                # Python 2: GzipFile accepts str writes directly.
                output = gzip.GzipFile(output, 'w')
            else:
                # Python 3: text mode is needed to write str objects.
                output = gzip.open(output, 'wt', encoding='utf-8')
        else:
            output = open(output, 'w')
    try:
        if isinstance(s, (list, tuple)):
            # Recursive calls receive the open file, so they never close it.
            for ss in s:
                fasta_write(output, ss, line_width)
        else:
            output.write("> %s\n" % s.header)
            for i in range(0, len(s.seq), line_width):
                output.write("%s\n" % s.seq[i:i+line_width])
    finally:
        if opened_here:
            output.close()


def rfasta_write(output,seqs):
    """
        Restricted FASTA

        This format (used by BioProspector) is just FASTA with the whole sequence on one line.

        @param output either a file (opened for writing) or a filename
        @param seqs an iterable of fasta_sequence objects (any object with
                    'header' and 'seq' members is accepted)

        A file opened by this function is closed before it returns; a file
        object passed in by the caller is left open.
    """
    opened_here = isinstance(output, str)
    if opened_here:
        output = open(output, 'w')
    try:
        for s in seqs:
            output.write("> %s\n" % s.header)
            output.write("%s\n" % s.seq)
    finally:
        if opened_here:
            output.close()