#!/usr/bin/env python3
# ==============================================================
# Tomas Bruna
# Copyright 2019, Georgia Institute of Technology, USA
#
# Flag hints which were mapped from the best DIAMOND protein hit
# ==============================================================


import argparse
import csv
import re
import sys


def extractFeatureGff(text, feature):
    regex = feature + '=([^;]+);'
    return re.search(regex, text).groups()[0]


def loadTopPairs(diamondPairs):
    topPairs = set()
    allPairs = set()
    prevGene = ""
    for row in csv.reader(open(diamondPairs), delimiter='\t'):
        gene = row[0]
        allPairs.add(row[0] + "_" + row[1])
        if gene != prevGene:
            topPairs.add(row[0] + "_" + row[1])
        prevGene = gene
    return topPairs, allPairs


def flagHints(hints, topPairs, allPairs):
    for row in csv.reader(open(hints), delimiter='\t'):
        hintProt = extractFeatureGff(row[8], "prot")
        seedGene = extractFeatureGff(row[8], "seed_gene_id")
        key = seedGene + "_" + hintProt

        if key not in allPairs:
            sys.exit('error: Gene-protein pair "' + seedGene + "-" +
                     hintProt + '" present in the Spaln output was not found '
                     'in the file with DIAMOND gene-protein pairs. This issue '
                     'can be caused by the presence of special characters in '
                     'the fasta headers of input files. Please remove any '
                     'special characters and re-run ProtHint. '
                     'See https://github.com/gatech-genemark/ProtHint#input '
                     'for more details about the input format.')

        if key in topPairs:
            row[8] += " topProt=TRUE;"
        print("\t".join(row))


def main():
    args = parseCmd()
    topPairs, allPairs = loadTopPairs(args.diamondPairs)
    flagHints(args.hints, topPairs, allPairs)


def parseCmd():

    parser = argparse.ArgumentParser(description='Flag hints which were mapped \
                                     from the best DIAMOND protein hit')

    parser.add_argument('hints', metavar='hints.gff', type=str,
                        help='File with hints, source gene-protein pair \
                        is assumed to be in the 9th colum, labeled \
                        "seed_gene_id" and "prot", respectively.')

    parser.add_argument('diamondPairs', metavar='diamondPairs', type=str,
                        help='File with DIAMOND pairs. File should be sorted \
                        by query protein ID in column 1 and by target protein \
                        score (from best to worst) in column 2.')

    return parser.parse_args()


if __name__ == '__main__':
    main()