#!/usr/bin/env python3 # ============================================================== # Tomas Bruna # Copyright 2019, Georgia Institute of Technology, USA # # Convert spaln output to gff with introns, start and stop codons. Starts are # reported only if the alignment starts with ATG and the protein start matches # the alignment start. Stops are reported only if the codon right after the last # aligned exon is a stop codon and protein end matches the alignment end. Spaln # output file is read from standard input. # ============================================================== import sys import argparse def getPair(row): return row[0] + "_" + row[1] def printIntron(row, prevRow, eScore): if float(prevRow[10].strip()) < eScore or float(row[10].strip()) < eScore: return spliceSites = row[14].strip().split()[5] print(row[1] + '\tSpaln\tIntron\t' + str(int(prevRow[9]) + 1) + '\t' + str(int(row[8]) - 1) + '\t.\t+\t.\tprot=' + row[0] + '; iScore=' + row[12].strip() + '; Llen=' + prevRow[3].strip() + '; LScore=' + prevRow[10].strip() + '; Lid=' + prevRow[2].strip() + '; Rlen=' + row[3].strip() + '; RScore=' + row[10].strip() + '; Rid=' + row[2].strip() + '; Ssites=' + spliceSites + ';') def printStart(row, eScore, geneFasta): protStart = int(row[6]) exonScore = float(row[10].strip()) genomeStart = int(row[8]) if exonScore > eScore and protStart == 1: with open(geneFasta) as gene: gene.readline() seq = gene.read().replace("\n", "") firstCodon = seq[genomeStart - 1:genomeStart + 2] if firstCodon == "ATG": print(row[1] + '\tSpaln\tStart_codon\t' + str(genomeStart) + '\t' + str(genomeStart + 2) + '\t.\t+\t0\tprot=' + row[0] + '; eScore=' + row[10].strip() + '; elen=' + row[3].strip() + '; eid=' + row[2].strip() + ';') def printStop(row, eScore, geneFasta, protFasta): protEnd = int(row[7]) exonScore = float(row[10].strip()) genomeEnd = int(row[9]) protLength = 0 with open(protFasta) as prot: prot.readline() prot = prot.read().replace("\n", "") protLength = len(prot) if exonScore > eScore and protLength - protEnd < 5: with open(geneFasta) as gene: gene.readline() seq = gene.read().replace("\n", "") if genomeEnd + 3 > len(seq): return stopCodon = seq[genomeEnd:genomeEnd + 3] if stopCodon == "TAA" or stopCodon == "TAG" or stopCodon == "TGA": print(row[1] + '\tSpaln\tStop_codon\t' + str(genomeEnd + 1) + '\t' + str(genomeEnd + 3) + '\t.\t+\t0\tprot=' + row[0] + '; eScore=' + row[10].strip() + '; elen=' + row[3].strip() + '; eid=' + row[2].strip() + ';') def main(): args = parseCmd() prevPair = "" prevRow = "" for line in sys.stdin: row = line.split("\t") if row[0][0] == '#' or row[0][0] == '@': continue if int(row[9]) - int(row[8]) < 0: continue pair = getPair(row) if (pair == prevPair): printIntron(row, prevRow, args.intronScore) else: printStart(row, args.startScore, args.gene) prevPair = pair prevRow = row if (prevRow != ""): printStop(prevRow, args.stopScore, args.gene, args.prot) def parseCmd(): parser = argparse.ArgumentParser(description='Convert spaln output to gff with introns, \ start and stop codons. Starts are reported only if the \ alignment starts with ATG and the protein start matches \ the alignment start. Stops are reported only if the codon \ right after the last aligned exon is a stop codon and protein \ end matches the alignment end. Spaln output file is read \ from standard input.') parser.add_argument('--intronScore', type=float, required=True, help='Minimum score of exons downstream and upstream of \ intron fot intron to be reported') parser.add_argument('--startScore', type=float, required=True, help='Minimum score of a first aligned exon for a start \ to be reported') parser.add_argument('--stopScore', type=float, required=True, help='Minimum score of last aligned exon for a stop \ to be reported') parser.add_argument('--gene', type=str, required=True, help='Fasta file with genomic sequence used for alignment.') parser.add_argument('--prot', type=str, required=True, help='Fasta file with protein sequence used for alignment.') args = parser.parse_args() return args if __name__ == '__main__': main()