#!/bin/bash

explanation='
add Trinotate annotation
'
inputdef='
input_1::contig file:*.fasta,*.fa,*.fsa,*.fna
input_2::gene2isoform map file:*
'
optiondef='
opt_c:cpu threads:8
opt_m:memory limit (GB):64
'
runcmd="$0 -c #opt_c# -m #opt_m# #input_1# #input_2#"

export IM_TRINOTATE="c2997108/centos7:1-trinity_2.8.5-kallisto_0.46.0-blast_2.9.0-trinotate-3.1.1-R_3-kegg_2"
export IM_BLASTN="c2997108/centos7:1-blast_2.9.0-nt_20190618-13"
export IM_MERGE="c2997108/centos6:3"

source $(dirname `readlink -f $0 || echo $0`)/common.sh

set -eux
set -o pipefail

fasta=$input_1
gene2trans=$input_2


DO_TRINOTATE /usr/local/TransDecoder-TransDecoder-v5.5.0/TransDecoder.LongOrfs -t $fasta
DO_TRINOTATE /usr/local/TransDecoder-TransDecoder-v5.5.0/TransDecoder.Predict -t $fasta --cpu $N_CPU

transfasta=`basename $fasta`

echo '
 '$ENV_TRINOTATE' blastx -db /usr/local/Trinotate-Trinotate-v3.1.1/db/uniprot_sprot.pep -query '$fasta' -num_threads '$N_CPU' -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > swissprot.blastx.outfmt6
 '$ENV_TRINOTATE' blastp -query '$transfasta'.transdecoder.pep -db /usr/local/Trinotate-Trinotate-v3.1.1/db/uniprot_sprot.pep -num_threads '$N_CPU' -max_target_seqs 1 -outfmt 6 -evalue 1e-5 > swissprot.blastp.outfmt6
 '$ENV_TRINOTATE' hmmscan --cpu '$N_CPU' --domtblout TrinotatePFAM.out /usr/local/Trinotate-Trinotate-v3.1.1/db/Pfam-A.hmm '$transfasta'.transdecoder.pep > pfam.log
 '$ENV_BLASTN' blastn -db /usr/local/db/nt.2019-06-18/nt -query '$fasta' -num_threads '$N_CPU' -max_target_seqs 1 -outfmt 6 | '$ENV_MERGE' myuniq.pl 1 > '$fasta'.blastn
'|tail -n+2|head -n-1 |DOPARALLEL

WAITPARALLEL

echo '
 '$ENV_TRINOTATE' /usr/local/signalp-4.1/signalp -f short -n signalp.out '$transfasta'.transdecoder.pep > sigP.log || true
 '$ENV_TRINOTATE' /usr/local/tmhmm-2.0c/bin/tmhmm --short < '$transfasta'.transdecoder.pep > tmhmm.out || true
 '$ENV_TRINOTATE' /usr/local/Trinotate-Trinotate-v3.1.1/util/rnammer_support/RnammerTranscriptome.pl --transcriptome '$fasta' --path_to_rnammer /usr/local/rnammer-1.2/rnammer || true
'|tail -n+2|head -n-1 |DOPARALLELONE

WAITPARALLEL

cat << EOF > run-trinotate-import.sh
 cp /usr/local/Trinotate-Trinotate-v3.1.1/db/20190618.sqlite Trinotate.sqlite
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite init --gene_trans_map $gene2trans --transcript_fasta $fasta --transdecoder_pep $transfasta.transdecoder.pep
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite LOAD_swissprot_blastx swissprot.blastx.outfmt6
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite LOAD_swissprot_blastp swissprot.blastp.outfmt6
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite LOAD_pfam TrinotatePFAM.out
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite LOAD_rnammer $fasta.rnammer.gff
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite LOAD_tmhmm tmhmm.out
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite LOAD_signalp signalp.out
 /usr/local/Trinotate-Trinotate-v3.1.1/Trinotate Trinotate.sqlite report --incl_pep --incl_trans > Trinotate.xls
 /usr/local/Trinotate-Trinotate-v3.1.1/util/extract_GO_assignments_from_Trinotate_xls.pl --Trinotate_xls Trinotate.xls -T -I > Trinotate.xls.gene_ontology
 #/usr/local/Trinotate-Trinotate-v3.1.1/util/annotation_importer/import_transcript_names.pl Trinotate.sqlite Trinotate.xls

#Trinotate Trinotate.sqlite report --incl_pep --incl_trans > trinotate_annotation_report.tsv0
EOF
DO_TRINOTATE bash run-trinotate-import.sh


cat << 'EOF' > run-kegg.sh
db=/usr/local/db/kegg-2017-10-22

awk -F'\t' 'FILENAME==ARGV[1]{koname[$1]=$2} FILENAME==ARGV[2]{mdname[$1]=$2} FILENAME==ARGV[3]{pathname[$1]=$2}
 FILENAME==ARGV[4]{split($2,arr,","); for(i in arr){mdid[arr[i]][$1]=1}}
 FILENAME==ARGV[5]{split($2,arr,","); for(i in arr){pathid[arr[i]][$1]=1}}
 FILENAME==ARGV[6]{
  if(FNR==1){print $0"\tKEGG ORTHOLOGY ID\tKEGG ORTHOLOGY NAME\tKEGG MODULE NAME\tKEGG PATHWAY NAME"}
  else{
   kid=""; kname=""; delete mid; mname=""; delete pid; pname="";
   split($12,arr,"`"); for(i in arr){if(arr[i]~"^KO:"){kid=kid","substr(arr[i],4)}}; n=split(kid,arr,",");
   for(i=2;i<=n;i++){kname=kname","koname[arr[i]]; if(length(mdid[arr[i]])!=0){for(j in mdid[arr[i]]){mid[j]=1}}}; kname=substr(kname,2);
   if(length(mid[i])!=0){for(i in mid){mname=mname","mdname[i]; if(length(pathid[i])!=0){for(j in pathid[i]){pid[j]=1}}}}; mname=substr(mname,2);
   if(length(pid[i])!=0){for(i in pid){pname=pname","pathname[i]}}; pname=substr(pname,2);
   print $0"\t"substr(kid,2)"\t"kname"\t"mname"\t"pname
  }
 }' $db/ko.ren $db/md.ren $db/path.ren $db/mdget.kolist $db/pathget.mdlist Trinotate.xls > Trinotate.xls2
EOF


DO_TRINOTATE bash run-kegg.sh
DO_BLASTN zcat /usr/local/db/nt.2019-06-18/nt.name.gz | DO_TRINOTATE awk -F'\t' 'FILENAME==ARGV[1]{id[$1]=$2; sim[$1]=$3; len[$1]=$4; eval[$1]=$11; flagname[$2]=1} FILENAME==ARGV[2]{if(flagname[$1]==1){name[$1]=$2}else{delete flagname[$1]}} FILENAME==ARGV[3]{if(FNR==1){print $0"\tNCBI NT top hit\te-value\tsimilarity\thit length"}else{if(name[id[$2]]==""){name[id[$2]]=id[$2]}; print $0"\t"name[id[$2]]"\t"eval[$2]"\t"sim[$2]"\t"len[$2]}}' $fasta.blastn /dev/stdin Trinotate.xls2 > Trinotate.xls3

DO_TRINOTATE awk -F'\t' '{ORS=""; print $2"\t"$1; for(i=3;i<=NF;i++){print "\t"$i}; print "\n"}' Trinotate.xls3 > Trinotate.xls3.isoform



post_processing


#<option detail>
#</option detail>

