#!/usr/bin/env python3
# ==============================================================
# Tomas Bruna
# Copyright 2020, Georgia Institute of Technology, USA
#
# Compare files with old and new gene seeds. Output gene seeds which are unique
# in the new seed file and hints from previous iteration of ProtHint which
# correspond to seeds which are identical in both files (with gene ids updated
# to match the ids in the new seed file.
# ==============================================================


import argparse
import csv
import re
import tempfile
import subprocess
import os


def extractFeatureGtf(text, feature):
    regex = feature + ' "([^"]+)"'
    return re.search(regex, text).groups()[0]


def extractFeatureGff(text, feature):
    regex = feature + '=([^;]+);'
    return re.search(regex, text).groups()[0]


def readGff(gffFile):
    for row in csv.reader(open(gffFile), delimiter='\t'):
        pass


class Gene:

    def __init__(self, firstCds, geneId):
        self.CDSs = [firstCds]
        self.geneId = geneId
        self.signature = firstCds[0] + "_" + firstCds[3] + "_" + \
            firstCds[4] + "_" + firstCds[6] + "_" + firstCds[7]

    def addCDS(self, cds):
        self.CDSs.append(cds)
        self.signature += "-" + cds[0] + "_" + cds[3] + "_" + \
            cds[4] + "_" + cds[6] + "_" + cds[7]

    def print(self, out):
        for cds in self.CDSs:
            out.write("\t".join(cds) + "\n")


def loadGenes(gtf):
    sortedGtf = sortGenes(gtf)
    genes = {}
    for row in csv.reader(open(sortedGtf), delimiter='\t'):
        if row[2] != "CDS":
            continue
        geneId = extractFeatureGtf(row[8], "gene_id")
        if geneId not in genes:
            genes[geneId] = Gene(row, geneId)
        else:
            genes[geneId].addCDS(row)
    os.remove(sortedGtf)

    return genes


def sortGenes(gtf):
    sortedGtf = tempfile.NamedTemporaryFile(delete=False).name
    subprocess.call(["bash", "-c", "sort -k1,1 -k4,4n -k5,5n " + gtf +
                    " > " + sortedGtf])
    return sortedGtf


def findUniqueNewGenes(newGenes, oldGenes, outFile):
    oldGeneSignatures = {}

    # Hash old genes signature to make the search fast
    for gene in oldGenes:
        oldGeneSignatures[oldGenes[gene].signature] = oldGenes[gene].geneId

    # Mapping of old to new gene ids for seed genes which are identical
    old2new = {}

    out = open(outFile, 'w')
    for gene in newGenes:
        newSignature = newGenes[gene].signature
        if newSignature not in oldGeneSignatures:
            newGenes[gene].print(out)
        else:
            old2new[oldGeneSignatures[newSignature]] = newGenes[gene].geneId
    out.close()

    return old2new


def printHintsWithIdenticalSeeds(prevSpalnGff, old2new, outFile):
    out = open(outFile, 'w')
    for row in csv.reader(open(prevSpalnGff), delimiter='\t'):
        seedGeneId = extractFeatureGff(row[8], "seed_gene_id")
        if seedGeneId in old2new:
            oldString = "seed_gene_id=" + seedGeneId + ";"
            newString = "seed_gene_id=" + old2new[seedGeneId] + ";"
            row[8] = row[8].replace(oldString, newString)
            out.write("\t".join(row) + "\n")
    out.close()


def main():
    args = parseCmd()
    oldGenes = loadGenes(args.prevGeneSeeds)
    newGenes = loadGenes(args.geneSeeds)
    old2new = findUniqueNewGenes(newGenes, oldGenes, args.uniqueNewSeedsOut)
    printHintsWithIdenticalSeeds(args.prevSpalnGff, old2new,
                                 args.identicalSpalnGffOut)


def parseCmd():

    parser = argparse.ArgumentParser(description='Compare files with old and \
                                     new gene seeds. Output gene seeds which \
                                     are unique in the new seed file and hints\
                                     from previous iteration of ProtHint\
                                     which correspond to seeds which are \
                                     identical in both files (with gene ids\
                                     updated to match the ids in the new \
                                     seed file.')

    parser.add_argument('--geneSeeds', type=str, required=True,
                        help='New seed genes. One isoform per gene is assumed')

    parser.add_argument('--prevGeneSeeds', type=str, required=True,
                        help='Seed genes from previous iteration. One isoform\
                        per gene is assumed.')

    parser.add_argument('--prevSpalnGff', type=str, required=True,
                        help='Scored hints from previous iteration')

    parser.add_argument('--uniqueNewSeedsOut', type=str, required=True,
                        help='Seed genes which are unique in the new set')

    parser.add_argument('--identicalSpalnGffOut', type=str, required=True,
                        help='Scored hints from previous iteration \
                        corresponding to seed genes which are the same in \
                        --geneSeeds and --prevGeneSeeds files. Gene ids of \
                        the hints are update to match the ids in the new \
                        --geneSeeds file')

    return parser.parse_args()


if __name__ == '__main__':
    main()