#!/usr/bin/env perl

##########################################################################################
#   This file is part of Proteinortho.
#   (C) 2009/2010 Marcus Lechner
#
#   Proteinortho is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published
#   by the Free Software Foundation; either version 2, or (at your
#   option) any later version.
#
#   Proteinortho is distributed in the hope that it will be useful, but
#   WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with Proteinortho; see the file COPYING.  If not, write to the
#   Free Software Foundation, Inc., 59 Temple Place - Suite 330,
#   Boston, MA 02111-1307, USA.
##########################################################################################
##########################################################################################
# About
##########################################################################################
# Proteinortho
# input fasta files with proteins
# output matrix with orthologous proteins
#
# @authors Marcus Lechner, Clemens Elias Thoelken, Paul Klemm
# @email lechner\@staff.uni-marburg.de
# @company University of Maruburg
# @date 2017-04-18
#
##########################################################################################

##MARK_FOR_NEW_BLAST_ALGORITHM => FLAG for adding a new blast algorithm.

### ============================================================================
### MAN-PAGE
###

=head1 NAME

proteinortho - Orthology detection tool (lechner\@staff.uni-marburg.de)

=head1 SYNOPSIS

proteinortho6.pl [options] <fasta files (one for each species, at least 2)>

ALIAS

proteinortho [options] <fasta files>

=head1 DESCRIPTION

B<proteinortho> is a tool to detect orthologous genes within different species.  For doing so, it compares similarities of given gene sequences and clusters them to find significant groups. The algorithm was designed to handle large-scale data and can be applied to hundreds of species at one. Details can be found in Lechner et al., BMC Bioinformatics. 2011 Apr 28;12:124.
To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (manuscript in preparation), is already build in Proteinortho.

Proteinortho assumes, that you have all your gene sequences in FASTA format either
represented as amino acids or as nucleotides. The source code archive contains some examples, namely C.faa, E.faa, L.faa, M.faa located in the test/ directory.
I<By default Proteinortho assumes amino> I<acids and thus uses diamond> (-p=diamond) to compare sequences. If you have nucleotide sequences, you need to change this by adding the parameter
-p=blastn+ (or some other algorithm). (In case you have only have NCBI BLAST legacy installed, you need to tell this too - either by adding -p=blastp_legacy or -p=blastn_legacy respectively.)
The full command for the example files would thus be proteinortho6.pl -project=test test/C.faa test/E.faa test/L.faa test/M.faa.
Instead of naming the FASTA files one by one, you could also supply test/*.faa as argument.
Please note that the parameter -project=test is optional. With this, you can set the prefix of the output files generated by Proteinortho.
If you skip the project parameter, the default project name will be myproject.

=head1 OPTIONS

=head2 Main parameters

use the GUI proteinorthoHelper.html (downloadable at https://gitlab.com/paulklemm_PHD/proteinortho/raw/master/proteinorthoHelper.html?inline=false)

=over 4

=item B<--step>={0,1,2,3} (default: 0)

0 -> all. 1 -> prepare blast (build db). 2 -> run all-versus-all blast. 3 -> run the clustering.

=item B<--project>=name (default: myproject)

prefix for all resulting file names

=item B<--inproject>=name (default: same as project)

load data from this namespace instead (works with intermediate files for step=2 and blast-graph for step=3)

=item B<--keep>

stores temporary blast results for reuse (proteinortho_cache_project directory). 
In a second run the intermediate blast results are loaded instead of calculated.
You can adjust the parameters e.g. a more strict -e evalue cut off and write the output to a different namespace using --inproject.

=item B<--cpus>=number (default: all available)

the number of processors to use (multicore/processor support) for step 2 and step 3

=item B<--threads_per_process>=number (default: 1)

number of threads per process, 
e.g. using -cpus=4 and -threads_per_process=2 will spawn 4 workerthreads using 2 cpu cores each = total of 8 cores

=item B<--verbose>={0,1,2} (default: 1)

verbose level. 1:keeps you informed about the progress

=item B<--silent>

sets the verbose level to 0.

=item B<--temp>=directory(.)

path to the temporary files

=item B<--force>

forces the recalculation of the blast results in any case in step=2. Also forces the recreation of the database generation in step=1

=item B<--clean>

removes all database files generated by the -p= algorithm afterwards

=back

=item B<--isoform>

Enables the isoform processing:
ncbi -> if the word 'isoform' is found 
uniprot -> 'Isoform of XYZ' (You need to add the *_additional.fasta files to the analysis)
trinity -> using '_iX' suffix

All isoforms are united to a single entity and treated as one protein in the RBH algorithm. 
E.g. Let I1 and I2 two isoforms, I1 reciprocally matches A and I2 B but no other hits are found: with this option the output is A-I-B (results of I1 and I2 are merged)
Extracting a group with an isoform will result in all isoforms.
For more information have a look at: 
https://gitlab.com/paulklemm_PHD/proteinortho/-/wikis/FAQ#how-does-the-isoform-work

=back

=head2 Search options (step 1-2)

(output: <myproject>.blast-graph)

=over 4

=item B<--p>=algorithm (default: diamond)

B<autoblast> : automatically detects the blast+ program (blastp,blastn,tblastn,blastx) depending on the input (can also be mixed together!).

B<blastn>,B<blastp>,B<tblastx>,B<blastn+>,B<blastp+>,B<tblastx+> : standard blast family. The suffix 'n' or 'p' indicates nucleotide or protein version (of the input files). Use *_legacy for legacy blast.

B<diamond> : Only for protein files! standard diamond procedure and for genes/proteins of length >40 with the additional --sensitive flag

B<lastn>,B<lastp> : lastal. -n : dna files, -p protein files (BLOSUM62 scoring matrix)!

B<rapsearch> : Only for protein files!

B<mmseqsp>,B<mmseqsn> : mmseqs2.

B<topaz> : Only for protein files!

B<usearch> : usearch_local procedure with -id 0 (minimum identity percentage).

B<ublast> : usearch_ublast procedure.

B<blatp>,B<blatn> : blat. -n : dna files, -p protein files

=item B<--e>=evalue (default: 1e-05)

E-value for blast
(column 11 of blast outfmt 6 output)

=item B<--selfblast>

apply selfblast, detects paralogs without orthologs

=item B<--sim>=float (default: 0.95)

The -sim relaxes the one best reciprocal hit to all adaptive reciprocal best hits. 
E.g. for -sim=0.9 all reciprocal hits within 90% of the highest bitscore are returned. 
This can reduces false negative on the cost of maybe increasing the false positives too.
-sim=1 will result in only the best reciprocal hits. In the most case this correspond to obe-to-one orthologs.
-sim=0 will result in all reciprocal hits within the evalue threshold.

=item B<--identity>=number (default: 25)

min. percent identity of best blast hits
(column 3 (pident) of blast outfmt 6 output)

=item B<--cov>=number (default: 50)

min. coverage of best blast alignments in %
coverage between protein A and B = min ( alignment_length_A_B/length_A, alignment_length_A_B/length_B )
(alignment_length_A_B = column 4 of blast outfmt 6 output)

=item B<--subparaBlast,--subpara>='options'

additional parameters for the search tool (-p=blast,diamond,...) example -subpara='-seg no' or -subpara='--more-sensitive' for diamond

=item B<--range>=number (default: -1 = disabled)

maximal length difference for any blast hit. 0 = filter for hits of same length

=item B<--identical>

only return entries that are 100% identical

=back

=item B<--subparaMakeBlast>='options'

additional parameters for the database generation (-p=blast -> makeblastdb,diamond -> diamond makedb,...) example -subpara='--ignore-warnings' for a common diamond error

=back

=head2 Synteny options (optional, step 2)

(output: <myproject>.ffadj-graph, <myproject>.poff-graph.tsv)

=over 4

=item B<--synteny>

activate PoFF extension to separate similar by contextual adjacencies (requires .gff for each .fasta)
Each protein with the ID 'XXX' needs to be refered in the gff file with the attribute 'Name=XXX'
For more information see 'https://gitlab.com/paulklemm_PHD/proteinortho#poff'

=item B<--dups>=number (default: 0)

PoFF: number of reiterations for adjacencies heuristic, to determine duplicated regions

=item B<--cs>=number (default: 3)

PoFF: Size of a maximum common substring (MCS) for adjacency matches

=item B<--alpha>=number (default: .5)

PoFF: weight of adjacencies vs. sequence similarity

=back

=head2 Clustering options (step 3)

(output: <myproject>.proteinortho.tsv, <myproject>.proteinortho.html, <myproject>.proteinortho-graph)

=over 4

=item B<--singles>

report singleton genes without any hit

=item B<--conn>=float (default: 0.1)

min. algebraic connectivity
This is the main parameter for the clustering step. 
Choose larger values then more splits are done, resulting in more and smaller clusters. (There are still cluster with an alg. conn. below this given threshold allowed if the protein to species ratio is good enough, see -minspecies option below)

=item B<--minspecies>=float (default: 1, must be >=0)

min. number of genes per species. If a group is found with up to (minspecies) genes/species, it wont be split again (regardless of the connectivity).

=item B<--nograph>

do not generate .graph file (pairwise orthology relations)

=item B<--core>

stop clustering if a split would result in groups that do not span at least -coreMinSpecies species. Overrules the -conn threshold.

=item B<--coreMinSpecies>

sets the minimal number of species for the -core option (default:0 = disabled)

=item B<--coreMaxProt>

sets the maximal number of proteins per species for the -core option (default:100)

=item B<--subparaCluster>='options'

additional parameters for the clustering algorithm (proteinortho_clustering) example -subparaCluster='-maxnodes 10000'.
Note: -rmgraph cannot be set. All other parameters of subparaCluster are replacing the default values (like -cpus or -minSpecies)

=item B<--xml>

do generate an orthoXML file (see http://www.orthoxml.org for more information). You can also use proteinortho2xml.pl <myproject.proteinortho>.

=back

=head2 Misc options

=over 4

=item B<--checkfasta>

checks input fasta files if the given algorithm can process the given fasta file.

=item B<--cleanblast>

cleans blast-graph with proteinortho_cleanupblastgraph (either generated in step 2 or loaded in step 3)

=item B<--desc>

write description files (for NCBI FASTA input only)

=item B<--binpath>=directory (default: PATH)

path to your local executables (blast, diamond, mcl, ...)

=item B<--debug>

gives detailed information for bug tracking

=back

=head2 Large compute jobs

=over 4

=item B<--jobs>=M/N

If you want to involve multiple machines or separate a Proteinortho run into smaller chunks, use the -jobs=B<M>/B<N> option.
First, run 'proteinortho6.pl -steps=1 ...' to generate the indices.
Then you can run 'proteinortho6.pl -steps=2 -jobs=B<M>/B<N> ...' to run small chunks separately.
Instead of B<M> and B<N> numbers must be set representing the number of jobs you want to divide the run into (B<M>) and the job division to be performed by the process.
E.g. to divide a Proteinortho run into 4 jobs to run on several machines, use

=back

=head1 Output

=head2 BLAST Search (step 1-2)

=over 4

=item <myproject>.B<blast-graph>

filtered raw blast data based on adaptive reciprocal best blast
matches (= reciprocal best match plus all reciprocal matches within a
range of 95% by default) The first two rows are just comments
explaining the meaning of each row. Whenever a further comment line (starting
with #) follows, it indicates results comparing the two species is
about to follow. E.g. # M.faa L.faa tells that the next lines representing
results for species M and L. All matches are reciprocal matches. If
e.g. a match for M_15 L_15 is shown, L_15 M_15 exists implicitly.
E-Values and bit scores for both directions are given behind each
match. The 4 comment numbers ('# 3.8e-124        434.9...') are representing the median values of
evalue_ab, bitscore_ab, evalue_ba and bitscore_ba.

  ----------------------------------------
  # file_a  file_b
  # a b evalue_ab bitscore_ab evalue_ba bitscore_ba
  # E.faa C.faa
  # 3.8e-124        434.9   2.8e-126        442.2
  E_10  C_10  3.8e-124  434.9 2.8e-126  442.2
  E_11  C_11  5.9e-51 190.7 5.6e-50 187.6
  ----------------------------------------

=back

=head2 Clustering (step 3)

=over 4

=item <myproject>.B<proteinortho-graph>

clustered myproject.blast-graph. Its connected components are represented in myproject.proteinortho.tsv. The format is the same as the blast-graph (see above).

  ----------------------------------------
  # file_a  file_b
  # a b evalue_ab bitscore_ab evalue_ba bitscore_ba
  # E.faa C.faa
  E_10  C_10  3.8e-124  434.9 2.8e-126  442.2
  E_11  C_11  5.9e-51 190.7 5.6e-50 187.6
  ----------------------------------------

=item <myproject>.B<proteinortho>

The connected components.
The first line starting with #is a comment line indicating the meaning of each column for each of the following lines which represent an orthologous group each.
The very first column indicates the number of species covered by this group. The second column indicates the number of genes included in the group.
Often, this number will equal the number of species, meaning that there is a single ortholog in each species.
If the number of genes is bigger than the number of species, there are co-orthologs present.
The third column gives rise to the algebraic connectivity of the respective group. Basically, this indicates how densely the genes are connected in the orthology graph that was used for clustering.
A connectivity of 1 indicates a perfect dense cluster with each gene similar to each other gene.
By default, Proteinortho splits each group into two more dense subgroups when the connectivity is below 0.1

  ----------------------------------------
  # Species Genes Alg.-Conn.  C.faa C2.faa  E.faa L.faa M.faa
  2 5 0.16  * * * L_643,L_641 M_649,M_640,M_642
  3 6 0.138 C_164,C_166,C_167,C_2 * * L_2 M_2
  2 4 0.489 * * * L_645,L_647 M_644,M_646
  ----------------------------------------

=back

=head2 POFF (-synteny)

The synteny based graph files (myproject.ffadj-graph and myproject.poff-graph) have two additional columns:
same_strand and simscore. The first one indicates if two genes from a match are located at the same strands (1) or not (-1).
The second one is an internal score which can be interpreted as a normalized weight ranging from 0 to 1 based on the respective e-values.
Moreover, a second comment line is followed after the species lines, e.g.

  ----------------------------------------
  # M.faa L.faa
  # Scores: 4     39      34.000000       39.000000
  ----------------------------------------

=over 4

=item <myproject>.B>ffadj-graph>

filtered blast data based on adaptive reciprocal best blast matches and synteny (only if -synteny is set)

=item <myproject>.B<poff-graph>

clustered ffadj graph. Its connected components are represented in myproject.poff.tsv (only if -synteny is set)

=back

=head1 EXAMPLES

=head2 Calling proteinortho

Sequences are typically given in plain fasta format like the files in test/

  test/C.faa:
  ----------------------------------------
  >C_10
  VVLCRYEIGGLAQVLDTQFDMYTNCHKMCSADSQVTYKEAANLTARVTTDRQKEPLTGGY
  HGAKLGFLGCSLLRSRDYGYPEQNFHAKTDLFALPMGDHYCGDEGSGNAYLCDFDNQYGR
  ...
  ----------------------------------------

  test/E.faa:
  ----------------------------------------
  >E_10
  CVLDNYQIALLRNVLPKLFMTKNFIEGMCGGGGEENYKAMTRATAKSTTDNQNAPLSGGF
  NDGKMGTGCLPSAAKNYKYPENAVSGASNLYALIVGESYCGDENDDKAYLCDVNQYAPNV
  ...
  ----------------------------------------

To run proteinortho for these sequences, simply call

  proteinortho6.pl test/C.faa test/E.faa test/L.faa test/M.faa

To give the outputs the name 'test', call

  proteinortho6.pl -project=test test/*faa

To use blast instead of the default diamond, call

  proteinortho6.pl -project=test -p=blastp+ test/*faa

=head2 POFF

The PoFF extension allows you to use the relative order of genes (synteny) as an additional criterion to disentangle complex co-orthology relations.
To do so, add the parameter -synteny.
You can use it to either come closer to one-to-one orthology relations by preferring synthetically conserved copies in the presence of two very similar paralogs (default),
or just to reduce noise in the predictions by detecting multiple copies of genomic areas (add the parameter -dups=3).
Please note that you need additional data to include synteny, namely the gene positions in GFF3 format.
As Proteinortho is primarily made for proteins, it will only accept GFF entries of type CDS (column #3 in the GFF-file).
The attributes column (#9) must contain Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective identifier in the FASTA format.
It may not contain a semicolon (;)! Alternatively, you can also set ID=GENE IDENTIFIER.
Example files are provided in the source code archive.
Hence, we can run proteinortho6.pl -project=test -synteny test/A1.faa test/B1.faa test/E1.faa test/F1.faa to add synteny information to the calculations.
Of course, this only makes sense if species are sufficiently similar. You won't gain much when comparing e.g. bacteria with fungi.
When the analysis is done you will find an additional file in your current working directory, namely test.poff.tsv.
This file is equivalent to the .proteinortho.tsv file (above) but can be considered more accurate as synteny was involved for its construction.

=head1 Hints

Using .faa to indicate that your file contains amino acids and .fna to show it contains nucleotides makes life much easier.

Sequence IDs must be unique within a single FASTA file. Consider renaming otherwise. Note: Till version 5.15 sequences IDs had to be unique among the whole dataset. Proteinortho now keeps track of name and species to avoid the necessissity of renaming.

The directory src contains useful tools, e.g. proteinortho_grab_proteins.pl which fetches protein sequences of orthologous groups from Proteinortho output table

=head1 AUTHORS

Marcus Lechner (lechner\@staff.uni-marburg.de), Clemens Elias Thoelken, Paul Klemm

=head1 ONLINE INFORMATION

For download and online information, see
L<https://www.bioinf.uni-leipzig.de/Software/proteinortho/>

=head1 REFERENCES

Lechner, M., Findeisz, S., Steiner, L., Marz, M., Stadler, P. F., & Prohaska, S. J. (2011). Proteinortho: detection of (co-) orthologs in large-scale analysis. BMC bioinformatics, 12(1), 124.

=cut

# =item B<--mcl>
# enables the mcl algorithm for clustering instead of power iteration or lapacks routine. (needs mcl to be installed, call 'mcl' to test if you installed mcl correctly)
# =item B<--ram>=number (default: 75% of free memory)
# maximal used ram threshold for LAPACK and the input graph in MB. By default 75% of the free memory is used
# =item B<--purity>=float (>0, default: choose value automatically)
# avoid spurious graph assignments, the lower the purity threshold the more uncertain edges are cut

##########################################################################################
# Imports
##########################################################################################
use strict;
use warnings "all";
use threads;
use threads::shared;
use Thread::Queue;
use File::Basename;
use Cwd;
use Cwd 'abs_path';
use Pod::Usage; # --man
use POSIX;
#use Term::ReadKey; # press c to cancel, any other key to continue
##########################################################################################
# Variables
##########################################################################################
our $version = "6.1.7";
our $step = 0;    # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1   -> don't / be verbose
our $core = 0; 
our $coreMinSpecies=0;
our $coreMaxProts=100;
our $debug = 0;   # 0/1   -> don't / show debug data
our $exactstep3 = 0;
#our $reflexiv = 0; # 0/1   -> check sets against themselves
our $synteny = 0; # 0/1   -> Apply synteny algorithm
our $neighbourjoin = 0; # 0/1   -> Merge neighbours
our $duplication = 2; # 0-9 not 1 -> Repeats for duplication extension
our $cs = 3;  # int   -> cs-value
our $alpha = 0.5; # Alpha value for src/proteinortho_ffadj_mcs.py
our $connectivity = "0.1"; # min algebraic connectivity (normal eigenvalue) for clustering step (-step=3), not the evalue cutoff for convergence
our $cpus   = 0;  # 0 = autodetect
our $evalue = "1e-05"; # evalue cutoff for blast step (-step=2), different to the clustering evalue cutoff (set in proteinortho_clustering.cpp)
our $purity = "-1"; # for clustering step: if entries (absolute value) of the vector are blow the purity -> treated as '0'
our $coverage = 50; # Percent coverage threshold for two proteins
our $identity = 25; # Percent identity threshold for two proteins
our $blastmode = "diamond";
#our $tmpdir = "./";  # Dir for tmp-files
our $sim = 0.95;
our $report = 3;
our $startat = undef; # removed 5.16
our $stopat = undef;  # removed 5.16
our $keep = 0;
our $force = 0;
our $selfblast = 0;
our $twilight = 0;
our $singles = 0;
our $clean = 0;
our $blastOptions = "";
our $makeBlastOptions = "";
our $clusterOptions = "";
our $nograph = 0;
our $doxml = 0;
our $desc = 0;
our $range = -1;
our $tmp_path = "";
our $useMcl = 0;
$|=1; #autoflush

# Internal
our $blastversion = "unknown";  # Auto-detected blastmode version
our $binpath = "";
our $makedb = "";   # makedb command depends on blastmode
our $blast = "";    # blast command
our $jobque = Thread::Queue->new(); # Jobs todo
our $check_files_QUEUE = Thread::Queue->new();
our $step1_QUEUE = Thread::Queue->new();
our $log_QUEUE = Thread::Queue->new();
our $do_log :shared;
our $jobs_done :shared = 0;    # Counter
our $jobs_todo = 0;     # Sum of jobs
our $project = "myproject";   # Project name
our $inproject = "";   # empty = set to $project
our $graph_lock :shared;
our $syn_lock :shared;
our $all_jobs_submitted :shared = 0;
our $po_path = "";
our $run_id = "";
our $hmm_search_evalue = "1e-100";
our $hmm_minprots = 15; 
our %gene_counter :shared;    # Holds the number of genes for each data file (for sorting)
our %max_gene_length_diamond :shared;    # Holds the maximum length of genes for each data file (for diamond -> -sensitive option)
our $threads_per_process :shared = 1; # Number of subthreads for blast
our $identical=0;

our $freemem_inMB = -1; # -1 = detect automatically with 'free -m'

# Split work
our $split_to_X_jobs = -1;
our $jobnumber = -1;
our $part = -1;

our $checkblast=0;
our $checkfasta=0;
our $minspecies=1;
our $isoform="";

our $RED="\033[1;31m";
our $RED2="\033[1;31m";
our $GREEN="\033[1;32m";
our $BLUE="\033[1;36m";
our $ORANGE="\033[1;33m";
our $NC="\033[0m"; # No Color
our %isoform_mapping :shared;

my $tput=`tput color 2>/dev/null`; # test if the shell supports colors
$tput=~s/[\r\n]+$//;
if ($tput=~m/[^0-9]|^&/ && $tput <16) {
  $RED="";
  $GREEN="";
  $ORANGE="";
  $NC="";
}

##########################################################################################
# Parameters
##########################################################################################
our @files = ();
our @files_cleanup = ();
our %files_map;
foreach my $option (@ARGV) {
  if ($option =~ m/^--?step=(0|1|2|3|4)$/)      { $step = $1;   }
  elsif ($option =~ m/^--?verbose$/)      { $verbose = 1;  }
  elsif ($option =~ m/^--?verbose=([012])$/)      { $verbose = $1;  }
  elsif ($option =~ m/^--?silent$/)      { $verbose = 0;  }
  elsif ($option =~ m/^--?core$/)        { $core = 1;  }
  elsif ($option =~ m/^--?coreMinSpecies=([0-9]+)$/)      { $coreMinSpecies = $1;  }
  elsif ($option =~ m/^--?coreMaxProts=([0-9]+)$/)      { $coreMaxProts = $1;  }
  elsif ($option =~ m/^--?unique$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?checkblastgraph$/)      { $checkblast = 1;  }
  elsif ($option =~ m/^--?checkblast$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?checkfasta$/)       { $checkfasta = 1;  }
  elsif ($option =~ m/^--?check$/)       { $checkfasta = 1; $checkfasta = 1; }
  elsif ($option =~ m/^--?cleanblast$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?cleanupblast$/)       { $checkblast = 1;  }
  elsif ($option =~ m/^--?te?mp=(.+)$/)     { $tmp_path = $1;
                # make sure it ends with /
                unless ($tmp_path =~ /\/$/) {$tmp_path .= "/";}
                my $pwd=$ENV{"HOME"}; $tmp_path=~s/~/$pwd/g;
                if(! -d $tmp_path || ! -R $tmp_path || ! -W $tmp_path ){
                  &Error(" -tmp=$tmp_path is not accessible. Check if the directory exists and is read- and writable.");
                }else{
                  if($tmp_path =~ m/proteinortho_cache/){ $keep=1; }
                }
              }
  elsif ($option =~ m/^--?debug$/)          { $debug = 1;  }
  elsif ($option =~ m/^--?exactstep3$/)           { $exactstep3 = 1;  }
  elsif ($option =~ m/^--?debug=([\da-zA-Z_]*)$/)         { $debug = $1;  }
  elsif ($option =~ m/^--?p=(.*)$/)         { $blastmode = $1; if($blastmode eq "blastn"){$blastmode.="+";} if($blastmode eq "blastp"){$blastmode.="+";}}
  elsif ($option =~ m/^--?e=(.*)$/)       { $evalue = $1; }
  elsif ($option =~ m/^--?range=([0-9]+)$/)       { $range = $1; }
  elsif ($option =~ m/^--?cpus=(\d*)$/)       { $cpus = $1; }
  elsif ($option =~ m/^--?threads_per_process=(\d*)$/)       { $threads_per_process = $1; }
  elsif ($option =~ m/^--?cpus=auto$/)          { $cpus = 0; }
  elsif ($option =~ m/^--?alpha=([0-9\.]+)$/)     { $alpha = $1; }
  elsif ($option =~ m/^--?hmm_?evalue=([0-9\.eE+-]+)$/)     { $hmm_search_evalue = $1; }
  elsif ($option =~ m/^--?hmm_?e=([0-9\.eE+-]+)$/)     { $hmm_search_evalue = $1; }
  elsif ($option =~ m/^--?hmm_?minprots?=([0-9]+)$/)     { $hmm_minprots = $1; }
  elsif ($option =~ m/^--?purity=([0-9\.]+)$/)      { print STDERR "$ORANGE"."[WARNING]$NC The --memory option is deprecated\n"; } # $purity = $1; 
  elsif ($option =~ m/^--?report=([0-9]+)$/)      { $report = $1; }
  elsif ($option =~ m/^--?minspecies=([0-9.]+)$/)       { if($1>=0){$minspecies = $1;}else{&Error("the argument -minspecies=$1 is invalid.$RED minspecies needs to be >=0!$NC\nminspecies: the min. number of genes per species. If a group is found with up to (minspecies) genes/species, it wont be split again (regardless of the connectivity).");} }
  elsif ($option =~ m/^--?conn=([0-9\.]+)$/)      { $connectivity = $1; }
  elsif ($option =~ m/^--?cov=([0-9.]+)$/)       { $coverage = $1; if($coverage > 100 || $coverage < 0){print STDERR "$ORANGE"."[ERROR]$NC The --coverage parameter is out of range \n";exit 1;} if($coverage<1 && $coverage > 0){print STDERR "$ORANGE"."[WARNING]$NC The range of --cov is between 0 and 100 (percentage), it looks like you set this to a value between 0 and 1, I will correct the input to ".($coverage*100)."\n"; $coverage=$coverage*100;} }
  elsif ($option =~ m/^--?mcl$/)        { $useMcl = 1; }
  elsif ($option =~ m/^--?binpath=(.+)$/)     { $binpath  = $1."/"; if($binpath=~m/~/){my $pwd=$ENV{"HOME"}; $binpath=~s/~/$pwd/;} $binpath=~s/\/\/+/\//g; }
  elsif ($option =~ m/^--?identity=([0-9]+)$/)      { $identity = $1; }
  elsif ($option =~ m/^--?memory=([0-9]+)$/)      { $freemem_inMB = $1; print STDERR "$ORANGE"."[WARNING]$NC The --memory option is deprecated\n"; }
  elsif ($option =~ m/^--?mem=([0-9]+)$/)     { $freemem_inMB = $1; print STDERR "$ORANGE"."[WARNING]$NC The --mem option is deprecated\n"; }
  elsif ($option =~ m/^--?freememory=([0-9]+)$/)      { $freemem_inMB = $1; }
  elsif ($option =~ m/^--?ram=([0-9]+)$/)     { $freemem_inMB = $1; print STDERR "$ORANGE"."[WARNING]$NC The --ram option is deprecated\n"; }
  elsif ($option =~ m/^--?identity=twilight$/)    { $twilight = 1; }
  elsif ($option =~ m/^--?sim=([0-9\.]+)$/)       { $sim = $1; if($sim > 100 || $sim < 0){print STDERR "$ORANGE"."[ERROR]$NC The --sim parameter is out of range \n";exit 1;} if($sim>1 && $sim <= 100){print STDERR "$ORANGE"."[WARNING]$NC The range of --sim is between 0 and 1, it looks like you set this to a percent value, I will correct the input to ".($sim/100)."\n"; $sim=$sim/100;} }
  elsif ($option =~ m/^--?startat=([0-9]+)$/)     { $startat = $1; }
  elsif ($option =~ m/^--?stopat=([0-9]+)$/)      { $stopat = $1; }
  elsif ($option =~ m/^--?jobs?=([\d]+)\/([\d]+)$/)   { $jobnumber = $1; $split_to_X_jobs = $2; }
  elsif ($option =~ m/^--?selfblast$/)      { $selfblast = 1; }
  elsif ($option =~ m/^--?selfblast=(0|1)$/)    { $selfblast = $1; }
  elsif ($option =~ m/^--?singles$/)      { $singles = 1; }
  elsif ($option =~ m/^--?isoform$/)      { print STDERR "$ORANGE"."[ERROR]$NC The --isoform option needs one of the following arguments: 'ncbi','uniprot','trinity' (e.g. --isoform='uniprot' for an uniprot style -> *_additional.faa files) \n";die; }
  elsif ($option =~ m/^--?isoform=(trinity|uniprot|ncbi)$/)      { $isoform = $1; }
  elsif ($option =~ m/^--?singles=(0|1)$/)    { $singles = $1; }
  elsif ($option =~ m/^--?poff$/)       { $synteny = 1;  } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---";
  elsif ($option =~ m/^--?synteny$/)      { $synteny = 1;  } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; 
  elsif ($option =~ m/^--?synteny=(0|1)$/)    { $synteny = $1; } #print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; 
  elsif ($option =~ m/^--?dups=0$/)     { $duplication = 0; }
  elsif ($option =~ m/^--?dups=([1-8])$/)   { $duplication = $1+1;}
  elsif ($option =~ m/^--?neighbourjoin$/)    { $neighbourjoin = 1; }
  elsif ($option =~ m/^--?neighbourjoin=(0|1)$/)  { $neighbourjoin = $1; }
  elsif ($option =~ m/^--?cs=([0-9]+)$/)    { $cs = $1; }
  elsif ($option =~ m/^--?keep$/)     { $keep = 1; }
  elsif ($option =~ m/^--?identical$/)     { $identical = 1; }
  elsif ($option =~ m/^--?force$/)      { $force = 1; }
  elsif ($option =~ m/^--?clean$/)     { $clean = 1; }
  elsif ($option =~ m/^--?help$/)     { &print_header;print_usage_more(); exit 0;}
  elsif ($option =~ m/^--?test$/)     { &check_bins; &get_po_path; print "All necessary proteinortho_* binaries are found.\n"; exit 0;}
  elsif ($option =~ m/^--?h$/)     { &print_header;print_usage_more(); exit 0;}
  elsif ($option =~ m/^--?nograph$/)      { $nograph = 1; }
  elsif ($option =~ m/^--?xml$/)      { $doxml = 1; }
  elsif ($option =~ m/^--?graph$/)      { $nograph = 0; }
  elsif ($option =~ m/^--?desc$/)     { $desc = 1; }
  elsif ($option =~ m/^--?project=(.*)$/)   { $project = $1; $project=~s/[\/* \t\:\~\&\%\$\§\"\(\)\[\]\{\}\^\\]//g; }
  elsif ($option =~ m/^--?inproject=(.*)$/)   { $inproject = $1; $inproject=~s/[\/* \t\:\~\&\%\$\§\"\(\)\[\]\{\}\^\\]//g; }
  elsif ($option =~ m/^--?(subparaBlast|subpara)=(.*)$/i)  { $blastOptions = $2;}
  elsif ($option =~ m/^--?(subparaMakeBlast)=(.*)$/i)  { $makeBlastOptions = $2;}
  elsif ($option =~ m/^--?subparaCluster=(.*)$/i)  { $clusterOptions = $1;}
  elsif ($option =~ m/^--?v(ersion)?$/i)  { print $version."\n"; exit 0;}
  elsif ($option !~ /^-/)       { if(!exists($files_map{$option})){$files_map{$option}=1;push(@files,$option);}else{print STDERR "$ORANGE"."[WARNING]$NC The input $option was is skipped, since it was allready given as input.$NC\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n";sleep 10;print STDERR "Well then, proceeding...\n"} }
  elsif ($option =~ m/^--?man$/i){ my $bperldoc=(1-(`perldoc -l Pod::Usage 2>&1`=~m/need to install/)); if( !$bperldoc ){ print STDERR "You need to install the perl-doc package to use this man program.\n"; }else{ pod2usage(-exitstatus => 0, -verbose => 2) } exit 0; }
  else  {&print_usage_more(); &reset_locale();die "$RED"."[Error]$NC $ORANGE Invalid command line option: \'$option\'! $NC\n\n"; }
}
if($inproject eq ""){$inproject=$project} # if --inproject is not specified then set it to --project

# for no input -> print usage 
if(scalar @ARGV == 0 && $step == 0){ &print_header;print_usage_more(); exit 1; }

if($jobnumber != -1 && $tmp_path eq "" ){print STDERR "Please consider to use the -tmp option in combination with -jobs if you use proteinortho on a computing cluster system. Usually these clusters have dedicated local scratch directories.\n";}
if($selfblast){$checkblast=1;}

$po_path = &get_po_path();    # Determine local path

our $nucleotideAlphabet="ACGTNUacgtnu\.\-"; #ACGTURYSWKMBDHVNXacgturyswkmbdhvnx
our $aminoAlphabet="XOUBZACDEFGHIKLMNPQRSTVWYxoubzacdefghiklmnpqrstvwy\.\*\-";
our %autoblast_fileis :shared;
our $allowedAlphabet = {
  'blastn_legacy' => 'n' ,
  'blastp_legacy' => 'a' ,
  'tblastx' => 'n' ,
  'blastn+' => 'n' ,
  'blastp+' => 'a' ,
  'tblastx+' => 'n' ,
  'diamond' => 'a' ,
  'rapsearch' => 'a' ,
  'blatp' => 'a' ,
  'blatn' => 'n' ,
  'blatx' => 'n' ,
  'mmseqsn' => 'n' ,
  'mmseqsp' => 'a' ,
  'lastp' => 'a' ,
  'lastn' => 'n' ,
  'topaz' => 'a' };
##MARK_FOR_NEW_BLAST_ALGORITHM
our $blastmode_pendant = { # if you choose blastp+ and the input is nucleotide -> choose the pendant blastn+ and restart (only if you check files with -checkfasta)
  'blastn_legacy' => 'blastp_legacy' ,
  'blastp_legacy' => 'blastn_legacy' ,
  'blastn+' => 'blastp+' ,
  'blastp+' => 'blastn+' ,
  'rapsearch' => 'a' ,
  'blatp' => 'blatn' ,
  'blatn' => 'blatp' ,
  'mmseqsn' => 'mmseqsp' ,
  'mmseqsp' => 'mmseqsn' ,
  'lastp' => 'lastn' ,
  'lastn' => 'lastp' };
##MARK_FOR_NEW_BLAST_ALGORITHM

my $restart_counter=-1; # restart only once

our $proteinortho_exec='proteinortho_exec(){ # works with sh as well as bash (not csh)
  max_monitor_time=25920; # 3 days in 10s (sleep_time is 10s)
  max_restarts=10; # if max restarts are reached -> terminate with exit 1
  sleep_time=10;
  max_sus=60; # 10 min of suspension = restart
  restart_counter=0;

  work(){ 
    # fork the current process into a monitor and the input cmd
    # for debug could use this: force a OK after first run : #if [ "$restart_counter" -eq "1" ]; then cmd="sleep 1"; fi 

    eval "$cmd &"; # execute the input command in the background
    pid=$!;
    '.($debug ? 'echo "[main] started $pid";': "").'
    
    trap \'max_monitor_time=0; kill -1 $pid 2>/dev/null; wait $pid 2>/dev/null;\' 2 9 EXIT;

    (
      # this is the monitor script, every update interval check if cmd is suspended or not -> if too many suspended calls are found terminate this monitor as well as the cmd call
      it=0; 
      is_sus=0;
      sus_counter=0;
      sleep $sleep_time;
      '.($debug ? 'echo "[monitor] starting $pid";': "").'
      while [ "$it" -lt "$max_monitor_time" ] & [ "$pid" != "-1" ] & kill -0 $pid 2>/dev/null; do 
        # test of the cmd command has 0 cpu usage -> if yes then increment the sus_counter
        if [ "$(ps -o pcpu $pid | tail -n1 | grep " 0.")" != "" ];then is_sus=1; sus_counter=$((sus_counter+1)); else sus_counter=0; fi;
        '.($debug ? 'ps -o pcpu $pid; echo "[monitor] $pid=is_sus:$is_sus sus_counter:$sus_counter"': "").'
        if [ "$sus_counter" -gt "$max_sus" ]; then '.($debug ? 'echo "max_sus reached : kill $pid"': "").'
          kill $pid 2>/dev/null;
          wait $pid 2>/dev/null;
          return 1; # signal FAIL -> max_sus is reached
        fi;
        sleep $sleep_time; it=$((it+1)); 
      done
      '.($debug ? 'echo "[monitor] all ok $pid";': "").'
      return 0; # signal OK
    ) &
    pid_monitor=$!;

    wait $pid; # wait on a finish from cmd -> if the monitor stops, this will halt as well
    ret_pid=$?;
    
    if [ "$ret_pid" -eq "0" ]; then # command did run with OK output -> terminate this script (if monitor stops: this will never be 0)
      if kill -0 $pid_monitor 2>/dev/null; then kill $pid_monitor 2>/dev/null; wait $pid_monitor 2>/dev/null; fi; # kill the monitor if it is running
      return $ret_pid; 
    fi;

    wait $pid_monitor; # await the monitor (one last sleep tick)
    ret_monitor=$?; # this will be 1 if max_sus is reached

    '.($debug ? 'echo "[$restart_counter] pid:$pid (monitor=$pid_monitor) => ret_pid=$ret_pid, ret_monitor=$ret_monitor"': "").'

    if [ "$ret_monitor" != "0" ]; then # restart the process -> another work() call
      if [ "$restart_counter" -lt "$max_restarts" ];then
        restart_counter=$((restart_counter+1));
        '.($debug ? 'echo "[main] $pid restart (restart_counter=$restart_counter) return work";': "").'
        return &work;
      else
        '.($debug ? 'echo "[main] $pid max_restarts reached return 1";': "").'
        return 1; # max restarts are reached -> terminate with 1
      fi
    else
      '.($debug ? 'echo "[main] $pid return $ret_pid";': "").'
      return $ret_pid; # monitor was fine but the program did terminate -> passthrough
    fi
  }

  if [ "$1" != "" ]; then
    cmd=$1;
    pid=-1;

    precheck=1;
    if ! command -v ps >/dev/null 2>&1; then precheck=0; fi;
    if ! command -v grep >/dev/null 2>&1; then precheck=0; fi;
    if ! command -v tail >/dev/null 2>&1; then precheck=0; fi;
    if ! command -v kill >/dev/null 2>&1; then precheck=0; fi;
    if ! command -v wait >/dev/null 2>&1; then precheck=0; fi;

    if [ "$precheck" -eq "1" ]; then
      #trap \'max_monitor_time=0; echo "KILL"; jobs -p; jobs -p | xargs kill -1 2>/dev/null; echo "KILL DONE";jobs -p; exit\' 2 9 EXIT;
      exit &work;
    else
      '.($debug ? 'echo "precheck failed, using now a simple eval call"': "").'
      eval $cmd;
      exit $?;
    fi
  fi
}';

##########################################################################################
# Check parameters
##########################################################################################
if (defined($startat) || defined($stopat)) {
  &Error("Sorry, -startat and -stopat were removed. Please use -jobs=M/N for more flexible job splitting.");
}

if ($identical || $range>0){
  if($blastmode !~ /^blast[pn]\+|diamond?/) {
    &Error("--identical and --range are only supported for diamond, blastp and blastn (you currently have -p=$blastmode)");
  }else{
    if($blastmode eq "diamond"){  # NOTE diamond fails on --outfmt '6 qseqid ...' but is ok with --outfmt 6 qseqid ... 
      $blastOptions="--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen";
    }else{
      $blastOptions="--outfmt='6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen'";
    }
  }
}

if ($split_to_X_jobs == 0) {
  &Error("Job parameter use incorrectly. Number of jobs cannot be 0. Valid format: M/N");
}

if ($jobnumber != -1 && ($jobnumber == 0 || $jobnumber > $split_to_X_jobs)) {
  &Error("Job parameter use incorrectly. Job number cannot be 0 or greater than number of jobs. Valid format: M/N");
}
if ($jobnumber != -1) {
  if ($step != 2) {&Error("Parameter -jobs only works for step 2!");}
  $run_id = "_$jobnumber"."_".$split_to_X_jobs;
}

#our $gccversionstr = `gcc --version 2>/dev/null`;
#our $gccversion_main = "";
#if ($? == 0) {
#  $gccversion_main=($gccversionstr =~ m/^[^\n]+ ([\d]+)\.[\d\.]+\n/);
#}
#our $ompprocbind="close";
#if($gccversion_main eq "" || $gccversion_main < 5){
#  $ompprocbind="true";
#}

our $simgraph = "$project.blast-graph$run_id";    # Output file graph
our $syngraph = "$project.ffadj-graph$run_id";    # Output file synteny

if($inproject ne $project && -e "$inproject.blast-graph$run_id"){ system("ln -s $inproject.blast-graph$run_id $simgraph"); }
if($inproject ne $project && -e "$inproject.ffadj-graph$run_id"){ system("ln -s $inproject.ffadj-graph$run_id $syngraph"); }

our $csimgraph = "$project.proteinortho-graph";   # Output file graph
our $csyngraph = "$project.poff-graph";       # Output file synteny
our $simtable = "$project.proteinortho.tsv";      # Output file graph
our $simtablehtml = "$project.proteinortho.html";      # Output file graph
our $syntable = "$project.poff.tsv";          # Output file synteny
our $syntablehtml = "$project.poff.html";          # Output file synteny
our $desctable = "$project.descriptions";     # Output file seq descriptions

our $LC_All = $ENV{'LC_All'};
our $LC_NUMERIC = $ENV{'LC_NUMERIC'};

# set the locale enviroment variable such that sort works with scientific numbers
$ENV{'LC_All'}='C';
$ENV{'LC_NUMERIC'}='C';

sub reset_locale{
  $ENV{'LC_NUMERIC'}=$LC_NUMERIC;
  $ENV{'LC_All'}=$LC_All;
}

sub get_parameter{
  return "\nParameter-vector : (",'version',"=$version",",",
      'step',"=$step",",",
      'verbose',"=$verbose",",",
      'debug',"=$debug",",",
      'synteny',"=$synteny",",",
      'duplication',"=$duplication",",",
      'cs',"=$cs",",",
      'alpha',"=$alpha",",",
      'connectivity',"=$connectivity",",",
      'cpus',"=$cpus",",",
      'evalue',"=$evalue",",",
      'purity',"=$purity",",",
      'coverage',"=$coverage",",",
      'identity',"=$identity",",",
      'blastmode',"=$blastmode",",",
      'sim',"=$sim",",",
      'report',"=$report",",",
      'keep',"=$keep",",",
      'force',"=$force",",",
      'selfblast',"=$selfblast",",",
      'twilight',"=$twilight",",",
      'core',"=$core",",",
      'coreMinSpecies',"=$coreMinSpecies",",",
      'coreMaxProts',"=$coreMaxProts",",",
      'identical',"=$identical",",",
      'range',"=$range",",",
      'singles',"=$singles",",",
      'clean',"=$clean",",",
      'blastOptions',"=$blastOptions",",",
      'makeBlastOptions',"=$makeBlastOptions",",",
      'nograph',"=$nograph",",",
      'xml',"=$doxml",",",
      'desc',"=$desc",",",
      'tmp_path',"=$tmp_path,",
      'blastversion',"=$blastversion",",",
      'binpath',"=$binpath",",",
      'makedb',"=$makedb",",",
      'blast',"=$blast",",",
      'jobs_todo',"=$jobs_todo",",",
      'project',"=$project",",",
      'inproject',"=$inproject",",",
      'po_path',"=$po_path",",",
      'run_id',"=$run_id",",",
      'threads_per_process',"=$threads_per_process",",",
      "um","=$useMcl",
#      'freemem',"=$freemem_inMB",
      ")";
}

sub uniq {
    my %seen;
    grep !$seen{$_}++, @_;
}

if (!$force && -e "$project.proteinortho.tsv" && $step!=1 && $step!=4 && ( ( scalar(@files) > 2 && $step!=3 ) || $step==3 ) ) {
  print STDERR "!!!$ORANGE\n[Warning]:$NC Data files for project '$project' already exists. Previous output files might be overwritten.\n$NC";
  print STDERR "Press 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!! (disable with -force)\n";
    sleep 10;
  print STDERR "\nWell then, proceeding...\n\n";
}

# if step=1 -> generate blast db, no need for unlinking results
if($step!=1 && $step!=4){
  if($step!=3){ # if step 3 then dont destroy the blast results ...
    unlink($simgraph);
    unlink($syngraph);
  }
  # destroy all the *.proteinortho.tsv, *.poff, *.html results
  unlink($csimgraph);
  unlink($csyngraph);
  unlink($simtable);
  unlink($simtablehtml);
  unlink($syntable);
  unlink($syntablehtml);
  unlink($desctable);
}

if( $tmp_path eq "" ){$tmp_path=".";}
my $suffixjob="";
if($jobnumber != -1){ $suffixjob = "_${jobnumber}_${jobs_todo}" }
if( $tmp_path !~ qr/proteinortho_cache_$project$suffixjob/ ){
  if(! -d $tmp_path."/proteinortho_cache_$project$suffixjob"){
    system("mkdir $tmp_path/proteinortho_cache_$project$suffixjob");
    if( -d $tmp_path."/proteinortho_cache_$project$suffixjob"){
      $tmp_path.="/proteinortho_cache_$project$suffixjob/";
    }
  }else{
    $tmp_path.="/proteinortho_cache_$project$suffixjob/";
  }
}else{
  if(! -d $tmp_path){ mkdir("$tmp_path"); }
}
if($inproject ne $project){ system("cd '$tmp_path/'; ln -s ../proteinortho_cache_$inproject/* ."); }

our $rm_simgraph = "$tmp_path$project.removed_blast-graph";   # Output remove graph
our $rm_syngraph = "$tmp_path$project.removed_ffadj-graph";   # Output remove graph

##########################################################################################
# (RE)START MARK
##########################################################################################

RESTART:

$restart_counter++;

##########################################################################################
# Run / MAIN
##########################################################################################
if(scalar @ARGV >1){
  open(INFO,">>$project.info") || &Error("Could not open graph '$project.info': $!");
  print INFO "SYSTEMCALL of ".localtime(time()).":\nproteinortho ".join(" ",@ARGV)."\n";
  print INFO &get_parameter;
  print INFO "--------\n";
  close(INFO);
}

&print_header;    # Show Proteinortho Header
&auto_cpus;   # Set number of CPUs
if($step < 3){ # don't check blast-bins (e.g. diamond) for step 3=clustering (not necessary)
  &check_bins;    # Check blast,diamond,... version
}

if( $keep && $cpus > 8 ){ print STDERR "!!!$ORANGE\n[Warning]:$NC The '-keep' option can result in a I/O bottleneck if used with many cores !!!\n$NC"; }

# 6.0.32 check if there are _clean files available -> use it instead. 
sub search_clean_file{
  my $file = shift;
  my $file_clean = $file;
  $file_clean=~s/(\.[^.]+)$/_clean$1/;
  if(-e $file_clean){return $file_clean}
  return $file;
}
if($step < 3 || $step==4){ $_ = search_clean_file($_) for @files; @files = uniq @files }

# do uniprot _additional* file appending for --isoform
if($isoform eq "uniprot"){ 
  if($verbose){print STDERR "preparing files for isoform processing.\n";}

  my @new_files;
  my $iso_counter=0;
  my %files_hash = map {$_ => 1} @files;
  for(my $i = 0 ; $i < scalar @files ; $i++){
    if($files[$i] =~ m/^(.*)_additional.*$/  || $files[$i] =~ m/^(.*)additional.*$/){next;}
    my $foundAdd=0;
    my $i_withoutEnd=$files[$i];
    $i_withoutEnd=~s/\.[^.]+$//g;
    for(my $j = 0 ; $j < scalar @files ; $j++){
      if($files[$j] =~ m/^(.*)_additional.*$/ || $files[$j] =~ m/^(.*)additional.*$/){
        my $prefix=$1;
        if($prefix eq $i_withoutEnd){
          system("cp $files[$i] $tmp_path/".basename($files[$i])."; if [ ! -z \"\$(tail -c 1 \"$tmp_path/".basename($files[$i])."\")\" ]; then echo '' >> $tmp_path/".basename($files[$i])."; fi; cat ".$files[$j]." >>$tmp_path/".basename($files[$i]));
          push(@new_files,"$tmp_path/".basename($files[$i]));
          $iso_counter++;
          $foundAdd=1;
          last;
        }
      }
    }
    if(!$foundAdd){push(@new_files,$files[$i]);}
  }

  if($iso_counter == 0){print STDERR "\n!!!!\nWARNING\n!!!!\nI did not found any matching *_additional* files as expected by --isoform=uniprot, please check your input data!\nPlease provide the *_additional* files (e.g. if you provide 'UP000000437_7955.fasta', then simply also provide 'UP000000437_7955_additional.fasta').\n";}
  @files=@new_files;
}

# Always do
if($step < 3 || $step==4){ # don't check for step=3=clustering (not necessary) and if step=2, then files are allready checked with step=1
  &check_files;   # Check files, count sequences
}
@files = ();
foreach my $file (sort { if ($gene_counter{$a} == $gene_counter{$b}) {$a cmp $b;} else {$gene_counter{$b} <=> $gene_counter{$a};} } keys %gene_counter) {push(@files,$file);} # Biggest first # Alphabet otherwise 5.16

# 6.0.32 check if there are _clean files available -> use it instead. 
if($step < 3 || $step==4){ $_ = search_clean_file($_) for @files; @files = uniq @files }

if($verbose && $isoform ne ""){print STDERR "found ".(scalar keys %isoform_mapping)." isoforms in total.\n";}
if(scalar keys %isoform_mapping == 0 && $isoform ne ""){print STDERR "\n!!!!\nWARNING\n!!!!\n I did not found any isoforms as expected from the --isoform= option, please check your input data!\nProceeding anyway...\n";}

# Step 1, check files and make indices
if ($step == 0 || $step == 1) {
  if($verbose){print STDERR "\n$GREEN**Step 1**$NC\n";}

  #$do_log=1;
  #$_->kill('KILL')->detach foreach threads->list(); 
  #threads->create('log_Worker');
  #foreach my $file (@files) { $step1_QUEUE->enqueue($file); }
  #if($debug){print STDERR "Creating ".(scalar @files > $cpus+10 || $debug ? $cpus : 1)." generate_indices_Worker (cpus=$cpus) for ".$step1_QUEUE->pending." jobs\n"}
  #if($debug){print STDERR "currently running ".(threads->list())." threads (pre spawn)\n";}
  #threads->create('generate_indices_Worker');
  
  # !! INFO: !!
  # here we use only one thread to generate the db files !
  # -> diamond will fail if multiple makedb instances are running in parallel, 
  #    prducing un-joinable running threads (stuck within the `$cmd 2>&1` in the generate_indices)
  # approx 1 in 10 runs fail with a infinte loop see issue #69 (https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/69)
  # work-around in 6.1.4
  
  # code for multiple threads (does not work sometimes for diamond): 
  #for (my $i = 0; $i < ((scalar @files > 500 && scalar @files > $cpus+10) || $debug ? $cpus : 1) ; $i++) { my $tmp = threads->create('generate_indices_Worker') } # spawn a thread for each core
  #if($debug){print STDERR "currently running ".(threads->list())." threads (post spawn)\n";}
  #$_->join() foreach threads->list(); 

  foreach my $file (@files){
    &generate_indices($file);
  }; # Generate index files for blast
  if ($desc) {
    &write_descriptions;  # Write sequence description file
  }
}

# start the daemon
if($debug==99){
  my $updateIntervalInSeconds=1;
  system("pstree >/dev/null 2>&1");
  my $masterpid = getppid;
  if($? == 0){
    if ((my $pid = fork) == 0) { # make a fork of the current process 
      # this is now executed in parallel !
      my $ppid = getppid; # parent pid 
      while (1) {
        sleep $updateIntervalInSeconds;
        unless (kill 0, $ppid) { # check if parent is still alive (kill 0 = no kill is send but checked for errors) 
            exit;
        } else {
            print STDERR "ps --forest -o pid,tty,stat,time,cmd -g $masterpid\n".`ps --forest -o pid,tty,stat,time,cmd -g $masterpid`."\n";
        }
      }
    }
  }
}


# Step 2, run blast and synteny algorithm
if ($step == 0 || $step == 2) {

  # check if database exists for inputs:
  foreach my $file (keys %gene_counter) {
    if($blastmode !~ m/blat/ && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){ # test here
      my $a = "$tmp_path/DB/".basename($file);
      if(`ls '${a}'.${blastmode}* 1>/dev/null 2>&1` ne ""){                           # or here
        &Error("I did not find the database for ".$_[0]." (".$a."). Did you run --step=1 and maybe removed the databases? Please rerun 'proteinortho --step=1 --force /path/to/fastas' such that the databases can be recreated and then proceed with -step=2 and -step=3.");
      }
    }
  }# ok all db files exists

  if($verbose){print STDERR "\n$GREEN**Step 2**$NC using $blastmode ";
  if($verbose && ($synteny || $selfblast)){print STDERR "with :";}
  if($verbose && $synteny){print STDERR " synteny";}
  if($verbose && $selfblast){print STDERR " selfblast";}
  print STDERR "\n"; }

  &init_graph_output; # Initiate Output file(s)
  &run_blast;   # Run blasts
}

undef %max_gene_length_diamond;
%max_gene_length_diamond=();

# Check for duplicated edges if --unique or --checkblast
if($checkblast == 1){

  if($verbose) {print STDERR "\n$GREEN**Step 2.5 Checking blast graph(s)**$NC\n"};

  if($synteny){
    unlink($syngraph."_clean");
    system("$po_path/proteinortho_cleanupblastgraph $syngraph"."* >tmp_$syngraph 2>$tmp_path/proteinortho_cleanupblastgraph.out && mv tmp_$syngraph $syngraph"."_clean");
    
    my $wcl_clean = `grep "removed" $tmp_path/proteinortho_cleanupblastgraph.out | wc -l`;
    chomp($wcl_clean);
    $wcl_clean =~ s/[ \t]+.*$//;

    if($wcl_clean > 0){
      $syngraph.="_clean";
      if($verbose) {print STDERR "A clean version without any duplicated edges of $syngraph is saved to $syngraph"."_clean\n";}
    }else{
      if($verbose) {print STDERR "$syngraph is free of duplicated edges\n";}
      unlink($syngraph."_clean");
    }
  }else{
    unlink($simgraph."_clean");
    system("$po_path/proteinortho_cleanupblastgraph $simgraph"."* >tmp_$simgraph 2>$tmp_path/proteinortho_cleanupblastgraph.out && mv tmp_$simgraph $simgraph"."_clean");
    
    my $wcl_clean = `grep "removed" $tmp_path/proteinortho_cleanupblastgraph.out | wc -l`;
    chomp($wcl_clean);
    $wcl_clean =~ s/[ \t]+.*$//;

    if($wcl_clean > 0){
      $simgraph.="_clean";
      if($verbose) {print STDERR "A clean version without any duplicated edges of $simgraph is saved to $simgraph"."_clean\n";}
    }else{
      if($verbose) {print STDERR "$simgraph is free of duplicated edges\n";}
      unlink($simgraph."_clean");
    }
  }
}

# Step 3, spectral clustering
if ($step == 0 || $step == 3) {
  if($verbose){print STDERR "\n$GREEN**Step 3**$NC\n";}
  &cluster;             # form clusters and write outputs
}

# Step 4
our $hmm_QUEUE = Thread::Queue->new();    # A new empty queue
our $hmm_QUEUE_maxNumEl=0;
our $hmm_proteinorthofile = $synteny ? $syntable : $simtable;
our $hmm_header="";
our @hmm_filenames;
our %hmm_filenames2colid;
our $hmm_num_cols = 0;

if ($step == 4 && $debug) {  
  if($verbose){print STDERR "\n$GREEN**Step 4 (experimental feature)**$NC\n";}
  &hmmenriched;             
}

if ($clean) {&clean;}           # remove blast indices

if ($nograph) {
  unlink($simgraph);
  if ($synteny) {unlink($syngraph);}
}

#clean up tmp dir
if (!$keep && $tmp_path =~ m/\/proteinortho_cache_[^\/]+\d*\/$/ && $step!=1){system("rm -r $tmp_path >/dev/null 2>&1");}

foreach my $file (@files_cleanup){unlink($file);}

if($verbose){print STDERR "$GREEN"."All finished.$NC\n";}

##########################################################################################
# Functions
##########################################################################################

sub clean {
  if($verbose){ print STDERR "Removing temporary files...\n" }
  foreach my $file (@files) {
    system("rm $file.$blastmode"."*");
  }
}

sub workertest {
  local $SIG{KILL} = sub { threads->exit };
  my $tid = threads->tid();

  while(my $job = $hmm_QUEUE->dequeue()){
    print STDERR "$tid - -> start\n";
    system("sleep 2");
    print STDERR "$tid -> end\n";
  }
}

sub hmmenriched {

  if(!-e "$tmp_path/proteinortho_grab_proteins.$hmm_proteinorthofile.done"){
    system("proteinortho_grab_proteins.pl -cpus=$cpus -minprots=$hmm_minprots -ignoreWarning -tofiles='$tmp_path/' $hmm_proteinorthofile ".join(" ",@files)."; touch $tmp_path/proteinortho_grab_proteins.$hmm_proteinorthofile.done");
    if ($? != 0) { &Error("'proteinortho_grab_proteins.pl' failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code).") }
  }

  my %orthogroupfiles;
  foreach my $fp (glob("$tmp_path/*OrthoGroup*.fasta")) { if($fp=~m/OrthoGroup([0-9]+)\.fasta/){ $orthogroupfiles{$1}=$fp }}

  my $i = -1;
  open(my $FH,"<$hmm_proteinorthofile");
  while(<$FH>){
    chomp;
    my $line=$_;
    if(/^#/){
      #print RESULTFH "$line\n";
      $hmm_header .= $line;
      @hmm_filenames = split("\t",$line);
      for (my $i = 0; $i < scalar @hmm_filenames; $i++) { 
        my $base_file = $hmm_filenames[$i];
        $base_file=~s/\.fasta|\.faa|\.fna//g;
        $hmm_filenames2colid{$base_file}=$i 
      }
      next
    }

    $i=$i+1;
    my @a=split("\t",$line);

    if($hmm_num_cols==0){ $hmm_num_cols=scalar @a }
    
    my $groupfile = "";
    if( exists $orthogroupfiles{$i} && $a[1] >= $hmm_minprots ){ $groupfile = basename($orthogroupfiles{$i}); }
    $hmm_QUEUE->enqueue($groupfile."$;$line$;$i");
  }
  close($FH);

  for (my $i = 0; $i < $cpus; $i++) { $hmm_QUEUE->enqueue(undef) }
  $hmm_QUEUE_maxNumEl = $hmm_QUEUE->pending;

  for (my $i = 0; $i < $cpus; $i++) { threads->create('hmmWorker') } # spawn a thread for each core

  system("head -n1 $hmm_proteinorthofile >$hmm_proteinorthofile.enriched");

  open(my $FH_RESULT,">>$hmm_proteinorthofile.enriched");
  my %hmm_has_protein;
  foreach (threads->list()) {
    $_->join(); # await all threads
    my $tid=$_->tid();
    open(my $FH,"<$tmp_path/tid.$tid.out");
    while(<$FH>){
      chomp;
      my $line=$_;
      my @a = split("\t",$line);
      my $species=0;
      my $prots=0;
      for (my $j = 3; $j < scalar @a; $j++) {
        my @newel;
        foreach (split(",",$a[$j])){
          if(!exists $hmm_has_protein{$_}){
            $hmm_has_protein{$_}=();
            push(@newel,$_);
          }
        }
        $a[$j] = join(",",@newel);
        $prots += scalar @newel;
        if($a[$j] eq ""){$a[$j]="*"}else{$species++}
      }
      $a[0]=$species;
      $a[1]=$prots;
      if($species>0){print $FH_RESULT join("\t",@a)."\n"}      
    }
    close($FH);
  }
  close($FH_RESULT);
}

sub hmmWorker {
  local $SIG{KILL} = sub { threads->exit };

  my $tid = threads->tid();
  my %has_protein;

  open(my $FHret,">$tmp_path/tid.$tid.out");
  while(defined(my $job = $hmm_QUEUE->dequeue())){

    $job = "$tmp_path/$job";
    if ($verbose){ print STDERR "\r[tid=$tid] -step=4 in progress : ".(int(10000*(1-$hmm_QUEUE->pending/$hmm_QUEUE_maxNumEl))/100)."%" }

    my ($groupfile,$line,$i) = split("$;",$job);
    my @tmp_hmm_cols = split("\t",$line);
    if($tmp_hmm_cols[1] < $hmm_minprots){ print $FHret join("\t",@tmp_hmm_cols)."\n"; next }

    @has_protein{ split(",",join(",", @tmp_hmm_cols[3 .. $#tmp_hmm_cols ])) }=();

    if($verbose>1) {print STDERR "aligning grouped sequences\n";}

    if(!-e "$groupfile.aln"){
      system("mafft --anysymbol --auto --thread 1 $groupfile 2>/dev/null | MSAtrimmer.pl -fasta -g='90\%' -norealn - > $groupfile.aln 2>/dev/null");
      if ($? != 0) { &Error("'mafft --anysymbol --auto --thread 1 $groupfile | MSAtrimmer.pl -fasta -g='90\%' -norealn - > $groupfile.aln' ($groupfile) failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code).") }
    }

    if($verbose>1) {print STDERR "build hmm database\n";}

    if(!-e "$groupfile.aln.hmm"){
      system("hmmbuild $groupfile.aln.hmm $groupfile.aln >/dev/null 2>/dev/null");
      if ($? != 0) { &Error("'hmmbuild $groupfile.aln.hmm $groupfile.aln' ($groupfile) failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code).") }
    }

    if($verbose>1) {print STDERR "search hmm database against input sequences\n";}

    my $newprots=0;
    my $newspecies=0;

    foreach my $file (@files){
      my $base_file = basename($file);
      $base_file=~s/\.fasta|\.faa|\.fna//g;
      if(!exists $hmm_filenames2colid{$base_file}){print STDERR "$base_file invalid"; die;}
      my $coli = $hmm_filenames2colid{$base_file};
      
      my $hmm_ret=`hmmsearch --cpu 1 -E $hmm_search_evalue $groupfile.aln.hmm $file 2>/dev/null | grep '\>\>'`;# |  2>/dev/null

      foreach (split("\n",$hmm_ret)) {
        $_=~s/^>> ?//;$_=~s/[ \t].*$//; chomp;
        if(!exists $has_protein{$_}){
          if( $tmp_hmm_cols[$coli] eq "" || $tmp_hmm_cols[$coli] eq "*" ){ $newspecies++;$tmp_hmm_cols[$coli]="" }else{ $_=",$_" }
          $tmp_hmm_cols[$coli] .= $_; 
          $newprots++;
        }
      }
    }

    if($verbose>1) {print STDERR "$tid evaluating results\n";}
    
    if($newprots>0){
      $tmp_hmm_cols[0] += $newspecies;
      $tmp_hmm_cols[1] += $newprots;
      $tmp_hmm_cols[2] =  "NA";
    }
    if($tmp_hmm_cols[1]>=2 || ($singles && $tmp_hmm_cols[1]>=1)){ 
      print $FHret join("\t",@tmp_hmm_cols)."\n";
    }
  }
  close($FHret);

  threads->exit;
}


sub cluster {

  if(!$useMcl){

    # run the free command to calculate the free memory (in MB)
    if($freemem_inMB == -1){
      $freemem_inMB = `free -m 2>/dev/null`;
        if(defined($freemem_inMB) && $freemem_inMB){
        my @freemem_arr = split("\n",$freemem_inMB);
        $freemem_inMB=2048; #default value 2GiB
        if(scalar @freemem_arr>0){
          my @freemem_arr2 = split(" ",$freemem_arr[1]); #first row after header = Mem
          my $ci = 0;
          foreach (@freemem_arr2) {
            if($_ ne ""){
              if($ci eq 1){ # first column = total
                $freemem_inMB=$_*0.75;
                if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using $cpus cpu core(s). Please wait...\n";}
                last;
              }
              $ci=$ci+1;
            }
          }
        }
      }else{
        $freemem_inMB=16384;
        if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory (default value, command 'free' not found) and $cpus cpu core(s). Please wait...\n";}
      }
    }else{
      if($verbose){print STDERR "Clustering by similarity (Proteinortho mode) using up to $freemem_inMB MB of memory and $cpus cpu core(s). Adjust this behaviour with the -mem option. Please wait...\n";}
    }
    #if($freemem_inMB < 5){if($verbose){print STDERR "$ORANGE"."[WARNING]$NC Increased the maximal memory threshold to 5 MB ($freemem_inMB MB is not sufficent).$NC\n";}$freemem_inMB=5;} # minimum 5 MB
    if($freemem_inMB < 1000){if($verbose){print STDERR "$ORANGE"."[WARNING]$NC The memory threshold of $freemem_inMB MB seems somewhat small. I will increase the threshold to 1 GB (-ram=1000).$NC\n";}$freemem_inMB=1024;} # minimum 5 MB
      }else{
    if($verbose){print STDERR "$ORANGE"."[WARNING]$NC Using MCL for clustering (-ram has no effect). Please wait...$NC\n";} # minimum 5 MB
  }

  if(!$useMcl){
    my $cluster_verbose_level = "";
    if($verbose <= 1){ $cluster_verbose_level = "-verbose 1 "; }
    if($verbose == 2 || $debug>0){ $cluster_verbose_level = "-verbose 1 -debug $debug "; }

    #$ENV{'OMP_NUM_THREADS'}=1;
    #$ENV{'OMP_PROC_BIND'}=$ompprocbind;
    #-purity $purity 
    my $cmd="OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -rmgraph '$rm_simgraph' '$simgraph'* >'$simtable' ".($verbose >= 1 ? "" : "2>/dev/null");

    if($debug>0){ print STDERR "$cmd\n"; }

    system($cmd);
    if ($? != 0) {
      &Error("'proteinortho_clustering' failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nMaybe your operating system does not support the statically compiled version, please try recompiling proteinortho with 'make clean' and 'make' (and 'make install PREFIX=...').");
    }
  }else{
    my $stderrout = `$po_path/proteinortho_do_mcl.pl $cpus $simgraph*`;
    if ($? != 0) {
      print STDERR $stderrout;
      &Error("proteinortho_do_mcl.pl failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nDo you have a woring mcl version installed? (Try 'mcl' in the terminal) If not please install with e.g. 'apt-get install mcl'.");
    }
    system("mv mcl.proteinortho $simtable");
    system("mv mcl.proteinortho-graph $csimgraph");
  }

  if($verbose){print STDERR "\n[OUTPUT] -> Orthologous groups are written to $simtable\n";}
  if(scalar @files < 10){
    if($verbose){print STDERR "You can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl ".($isoform ne "" ? "--isoform " : "")."-tofiles $simtable ".(scalar @files == 0 ? "INPUT FASTA FILES" : "'".join("' '",@files)."'")."\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
  }

  system("(head -n 1 '$simtable' && tail -n +2 '$simtable' | LC_ALL=C sort -k1,1nr -k2,2nr -k3,3nr ) > '$simtable.sort'; mv '$simtable.sort' '$simtable'");
  
  if ($singles) {
    if($verbose){print STDERR "Adding singles...\n";}
    my $fastas = "'".join("' '",@files)."'";
    system("$po_path/proteinortho_singletons.pl $fastas <'$simtable' >>'$simtable'");
  }

  if (!$nograph && !$useMcl) {
    system("$po_path/proteinortho_graphMinusRemovegraph '$rm_simgraph' $simgraph* >'$csimgraph'");
    unless ($keep) {unlink($rm_simgraph);}
    if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csimgraph\n";}
  }elsif($useMcl){
    if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csimgraph\n";}
  }

  if(!$nograph && (-x "$po_path/proteinortho_summary.pl")){
    system("$po_path/proteinortho_summary.pl '$csimgraph' >'$csimgraph.summary' 2>/dev/null");
    if($verbose){
      system('perl -lne \'if($i++>30){print "...\n";exit(0)}if(/^# ----/){exit(0)}print $_\' '.$csimgraph.'.summary >&2');
      print STDERR "[OUTPUT] -> Summary is written to $csimgraph.summary\n";
    }
  }   


  if(scalar @files < 10){
    system("perl $po_path/proteinortho2html.pl $simtable '".join("' '",@files)."' >$simtablehtml");
    if($verbose){print STDERR "\n[OUTPUT] -> Orthologous groups are written to $simtablehtml\n";}
  }else{
    if($verbose){print STDERR "[OUTPUT] -> You can extract a html version of the output using:\nproteinortho2html.pl $simtable [PLACE FASTA FILES HERE] >$simtablehtml\n\n";}
  }

  if ($doxml) {
    system("perl $po_path/proteinortho2xml.pl $simtable >$simtable.xml");
    if ($? != 0) {system("rm $simtable.xml"); &Error("proteinortho2xml.pl failed");}
    if($verbose){print STDERR "[OUTPUT] -> Generated the OrthoXML (v0.3) file $simtable.xml";}
  }else{
    if($verbose){print STDERR "[OUTPUT] -> You can extract an OrthoXML (v0.3) version of the orthology group output using:\nproteinortho2xml.pl $simtable >$simtable.xml\n\n";}
  }

  if ($synteny) {
    if($verbose){print STDERR "\n"."Clustering by gene-order ($GREEN"."POFF mode$NC)\n";}
    if(!$useMcl){

      my $cluster_verbose_level = "";
      if($verbose <= 1){ $cluster_verbose_level = "-verbose 1 "; }
      if($verbose == 2 || $debug>0){ $cluster_verbose_level = "-debug 1 -debug $debug "; }

      #$ENV{'OMP_NUM_THREADS'}=1;
      #$ENV{'OMP_PROC_BIND'}=$ompprocbind;
      #-purity $purity 
      system ("OMP_NUM_THREADS=1 $po_path/proteinortho_clustering ".($core?"-core -coreMinSpecies $coreMinSpecies -coreMaxProts $coreMaxProts ":"")."$cluster_verbose_level ".($clusterOptions ne "" ? "$clusterOptions" : "" )." -minspecies $minspecies -cpus $cpus -weighted 1 -conn $connectivity -rmgraph '$rm_syngraph' '$syngraph'* >'$syntable' ".($verbose >= 1 ? "" : "2>/dev/null"));
      
      if ($? != 0) {
        &Error("proteinortho_clustering failed with code $?.$NC (Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Code)\nDid you use a static version? Maybe your operating system does not support the static compiled version, please recompile 'make clean' and 'make' or 'make USEPRECOMPILEDLAPACK=FALSE'.");
      }
    }else{
      system($po_path.'/proteinortho_do_mcl.pl '.$cpus.' '.$syngraph.'*');
      system("mv mcl.proteinortho $syntable");
      system("mv mcl.proteinortho-graph $csyngraph");
    }

    system("(head -n 1 '$syntable' && tail -n +2 '$syntable' | LC_ALL=C sort -k1,1nr -k2,2nr -k3,3nr ) > '$syntable.sort'; mv '$syntable.sort' '$syntable'");

    if($verbose){print STDERR "\n[OUTPUT] -> Orthologous groups are written to $syntable\nYou can extract the fasta files of each orthology group with\nproteinortho_grab_proteins.pl ".($isoform ne "" ? "--isoform " : "")."-tofiles $syntable ".(scalar @files == 0 ? "INPUT FASTA FILES" : "'".join("' '",@files)."'")."\n(Careful: This will generate a file foreach line in the file $syntable).\n";}

    if ($singles) {
      if($verbose){print STDERR "Adding singles...\n";}
      my $fastas = "'".join("' '",@files)."'";
      system("$po_path/proteinortho_singletons.pl $fastas <'$syntable' >>'$syntable'");
    }

    if (!$nograph && !$useMcl) {
      system("$po_path/proteinortho_graphMinusRemovegraph '$rm_syngraph' $syngraph* >'$csyngraph'");
      unless ($keep) {unlink($rm_syngraph);}
      if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs written to $csyngraph\n";}
    }elsif($useMcl){
      if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csyngraph\n";}
    }
    if(!$nograph && (-x "$po_path/proteinortho_summary.pl")){
      system("$po_path/proteinortho_summary.pl '$csyngraph' >'$csyngraph.summary'");
      if($verbose){
        system('perl -lne \'if($i++>30){print "...\n";exit(0)}if(/^# ----/){exit(0)}print $_\' '.$csyngraph.'.summary >&2');
        print STDERR "[OUTPUT] -> Summary is written to $csyngraph.summary\n";
      }
    }
    
    system("perl $po_path/proteinortho2html.pl $syntable '".join("' '",@files)."' >$syntablehtml");
    if($verbose){print STDERR "[OUTPUT] -> HTML version of orthologous groups are written to $syntablehtml\n";}

    if ($doxml) {
      system("perl $po_path/proteinortho2xml.pl $syntable >$syntable.xml");
      if ($? != 0) {system("rm $syntable.xml"); &Error("$ORANGE\n[WARNING]$NC -> proteinortho2xml.pl failed");}
      if($verbose){print STDERR "[OUTPUT] -> Generated the OrthoXML (v0.3) file $syntable.xml";}
    }else{
      if($verbose){print STDERR "[OUTPUT] -> You can extract an OrthoXML (v0.3) version of the orthology group output using:\nproteinortho2xml.pl $simtable >$simtable.xml\n\n";}
    }
  }
}

sub print_header {
  if($verbose){print STDERR "*****************************************************************\n";
  print STDERR "$GREEN"."Proteinortho$NC with PoFF version $version - An orthology detection tool\n",
  "*****************************************************************\n";}
}




sub print_usage_more {
print STDERR "
     |
  ${BLUE}  /${NC} ${RED2}\\  $NC
  ${BLUE} /\\${NC}${RED2} /\\ $NC
  ${BLUE}/ ${RED2}/${BLUE} \\ ${RED2}\\${NC}

Usage: proteinortho6.pl [OPTIONS] FASTA1 FASTA2 [FASTA...] (one for each species, at least 2)
Options:
         [General options]
         -project=    prefix for all result file names [default: myproject]
         -inproject=  load data from this namespace instead (works with intermediate files for step=2 and blast-graph for step=3)
         -cpus=       number of processors to use [default: auto]
         -threads_per_process= number of threads per process [default: $threads_per_process], 
                      e.g. using -cpus=4 and -threads_per_process=2 will spawn 4 workerthreads using 2 cpu cores each = total of 8 cores
         -silent      sets verbose to 0
         -temp=       path for temporary files [default: working directory]
         -keep        stores temporary blast results for reuse (proteinortho_cache_project directory). 
                      In a second run the intermediate blast results are loaded instead of calculated.
                      You can adjust the parameters e.g. a more strict -e evalue cut off and write the output to a different namespace using --inproject.
         -force       forces the recalculation of the blast results in any case in step=2. Also forces the recreation of the database generation in step=1
         -clean       remove all unnecessary files after processing
         -step=       1 -> generate indices
                      2 -> run blast (and ff-adj, if -synteny is set)
                      3 -> clustering
                      0 -> all (default)
         -isoform=    Enables the isoform merging: All isoforms are united to a single entity and treated as one protein in the RBH algorithm. 
                      E.g. Let I1 and I2 two isoforms, I1 reciprocally matches A and I2 B but no other hits are found: with this option the output is A-I-B (results of I1 and I2 are merged). 
                      Note: Extracting a group with an isoform will result in all isoforms (proteinortho_grab_proteins.pl -isoform).
                      ncbi -> if the word 'isoform' is found 
                      uniprot -> 'Isoform of XYZ' (You need to add the *_additional.fasta files to the analysis)
                      trinity -> using '_iX' suffix
                      For more information have a look at: 
                        https://gitlab.com/paulklemm_PHD/proteinortho/-/wikis/FAQ#how-does-the-isoform-work

         [Search options]
         -p=          blast program [default: diamond]

                      {blastp|blastn|tblastx|blastp_legacy|blastn_legacy|tblastx_legacy|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}

                      The program need to be installed first.
                      A suffix 'p' or 'n' indicates aminoacid fasta files (p) or nucleotide fasta files (n).
                      The '_legacy' suffix indicates legacy blastall (otherwise blast+ is used).

                        autoblast : automatically detects the blast+ program (blastp,blastn,tblastn,blastx) depending on the input (can also be mixed together!)
                        blast*|tblastx : standard blast+ family (blastp : protein files, blastn : dna files)
                        blast*_legacy : legacy blast family (blastall)
                        diamond : Only for protein files! standard diamond procedure and for genes/proteins of length >40 with the additional --sensitive flag
                        usearch : usearch_local procedure with -id 0 (minimum identity percentage).
                        ublast : usearch_ublast procedure.
                        lastn : standard lastal. Only for dna files!
                        lastp : lastal using -p and BLOSUM62 scoring matrix. Only for protein files!
                        rapsearch : Only for protein files!
                        topaz : Only for protein files!
                        blat* : Blat family. blatp : For protein files! blatn : For dna files! blatx : For dna files!
                        mmseqs* : mmseqs family. mmseqsp : For protein files! mmseqsn : For dna files! blatx : For dna files!
         -checkfasta  Checks if the given fasta files are compatible with the algorithm of -p
         -e=          E-value for blast [default: 1e-05]
                      (column 11 of blast outfmt 6 output)
         -selfblast   apply selfblast, detects paralogs without orthologs
         -sim=        min. reciprocal similarity for additional hits (0..1) [default: 0.95]
                      1 : only the best reciprocal hits are reported
                      0 : all possible reciprocal blast matches (within boundaries of -e, -cov, ...) are reported
         -identity=   min. percent identity of best blast hits [default: 25]
                      (column 3 (pident) of blast outfmt 6 output)
         -cov=        min. coverage of best blast alignments in \% [default: 50]
                      coverage between protein A and B = min ( alignment_length_A_B/length_A, alignment_length_A_B/length_B )
                      (alignment_length_A_B = column 4 of blast outfmt 6 output)
         -subparaBlast=    additional parameters for the search tool
                      example -subparaBlast='-seg no' or -subparaBlast='--more-sensitive' for diamond
         -subparaMakeBlast=    additional parameters for the database generation
         -identical   only return entries that are 100\% identical
         -range=      maximal length difference for any blast hit. e.g. 0 = filter for hits between proteins of same length [default:disabled]

         [Synteny options]
         -synteny     activate PoFF extension to separate similar sequences print
                      by contextual adjacencies (requires .gff for each .fasta)
                      Each protein with the ID 'XXX' needs to be refered in the gff file with the attribute 'Name=XXX'
                      For more information see 'https://gitlab.com/paulklemm_PHD/proteinortho#poff'
         -dups=       PoFF: number of reiterations for adjacencies heuristic,
                      to determine duplicated regions (default: 0)
         -cs=         PoFF: Size of a maximum common substring (MCS) for
                      adjacency matches (default: 3)
         -alpha=      PoFF: weight of adjacencies vs. sequence similarity
                      (default: 0.5)

         [Clustering options]
         -singles     report singleton genes without any hit
         -conn=       min. algebraic connectivity [default: 0.1]
                      This is the main parameter for the clustering step. 
                      Choose larger values then more splits are done, resulting in more and smaller clusters. 
                      (There are still cluster with an alg. conn. below this given threshold allowed if the protein to species ratio is good enough, see -minspecies option below)
         -xml         produces an OrthoXML formatted file of the *.proteinortho.tsv file.
         -nograph     do not generate .proteinortho-graph file (pairwise orthology relations)
         -core        stop clustering if a split would result in groups that do not span across all species of the inital connected component. Overrules the -conn threshold.

         [Misc options]
         -desc        write description files (for NCBI FASTA input only)
         -debug       gives detailed information for bug tracking
         -binpath=    path to your directory of local programs that are not available globally (this should not be needed)

         [Large compute jobs]
          In case jobs should be distributed onto several machines, use
         -jobs=M/N    N defines the number of defined job groups (e.g. PCs)
                      M defines the set of jobs to run in this process

         Example:     run with 
                        -step=1
                      to prepare data then to split a run on two machines use
                        -jobs=1/2 -step=2 on PC one and
                        -jobs=2/2 -step=2 on PC two
                      finally run with
                        -step=3 to finalize

For more information see the man page: 'proteinortho -man' or online: https://gitlab.com/paulklemm_PHD/proteinortho
Or you can use the GUI proteinorthoHelper.html (available at http://lechnerlab.de/proteinortho/)
Do you have suggestions or need more help: write a mail to lechner\@staff.uni-marburg.de.

";
}
# -mcl         enables the mcl algorithm for clustering instead of power iteration or lapacks routine. (needs mcl to be installed) 
# DEPRECATED -ram=        maximal used ram threshold for LAPACK and the input graph in MB [default: 75% of the free memory]
# -memory=     the amount of ram used for partioning in MB [default 75% of the total memory]
# -purity=     avoid spurious graph assignments, the lower the purity threshold the more uncertain edges are cut [>0, default: choose value automatically]

sub init_graph_output {
# if (-e $graph) {
#   &Error("Graph output file '$graph' already exists.");
# }
  open(GRAPH,">$simgraph") || &Error("Could not open graph '$simgraph': $!");
  print GRAPH "# file_a\tfile_b\n# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba\n";
  close(GRAPH);

  unless ($synteny) {return;}
# if (-e $syn) {
#   &Error("Synteny Graph output file '$syn' already exists.");
# }
  open(SYN,">$syngraph") || &Error("Could not open graph '$syngraph': $!");
  print SYN "# file_a\tfile_b\n# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba\tsame_strand\tsimscore\n";
  close(SYN);
}

sub set_threads_per_process {
  lock($jobs_done);
  my $willdo = ($jobs_todo-$jobs_done+$_[0]);

  if ($debug) {
    print STDERR "\nTODO: $jobs_todo DONE: $jobs_done Running: $_[0] -> $willdo\n";
  }

  if ($willdo < 1) {return;}

  my $optimal = int($cpus/$willdo);
  lock($threads_per_process);
  if ($optimal > $threads_per_process) {
    $threads_per_process = $optimal;
    if ($debug) {
      print STDERR "\nBlast subthreads was set to $threads_per_process\n";
    }
  }
}

sub run_blast {
  # Jobs todo
  $jobs_todo = 0;
  for (my $i = 0; $i < scalar(@files)-1+$selfblast; $i++) {$jobs_todo += scalar(@files)-$i-1+$selfblast;}

  # Divide 5.16
  $part = int($jobs_todo/$split_to_X_jobs); # Round up to not miss anything
  # print STDERR "$jobs_todo/$split_to_X_jobs = $part\n";
  my $from = ($jobnumber-1)*($part)+1;
  if ($jobnumber == 1) {$from = 1;}
  my $to = ($jobnumber-1)*($part)+$part;
  if ($jobnumber == $split_to_X_jobs) {$to = $jobs_todo;}

  $part = 1+$to-$from;  # real part

  if ($split_to_X_jobs != -1 && $part < 1) {&Error("I have problems coordinating $split_to_X_jobs groups for $jobs_todo jobs.");}
  if ($split_to_X_jobs <= 1) {$from = -1; $to = -1; $split_to_X_jobs = -1; $part = -1;}

  &set_threads_per_process(0);  # Check if we can apply more threads, nothing is running so far
  &print_blast_stats();

  # Spawn worker threads

  for (my $i = 0; $i < $cpus; $i++) {threads->create('workerthread');}

  # For each file against each other file
  my $job_number = 0;
  SPEC: for (my $i = 0; $i < scalar(@files)-1+$selfblast; $i++) {
    for (my $j = $i+1-$selfblast; $j < scalar(@files); $j++) {
      $job_number++;
      if ($from != -1 && $job_number < $from) {next;}
      # Wait for queque to get empty (with some buffer)
      while ($jobque->pending() > 32 + 2*$cpus) {sleep(1);}
      # Syncronize with other processes
      $jobque->enqueue("$i $j");
      if ($to != -1 && $job_number >= $to) {last SPEC;}
    }
  }
  # Tell all threads they can stop
  {lock($all_jobs_submitted); $all_jobs_submitted = 1;}

  # Wait until all jobs are done
  foreach (threads->list())   {$_->join();}
  &print_blast_stats();   if($verbose){print STDERR "\n";}

  if($verbose){print STDERR "[OUTPUT] -> written to $simgraph\n";}
}

sub workerthread {

  my $thread_id = threads->tid();
  my $temp_file = "$tmp_path$project-$run_id-$thread_id";

  $temp_file=~s/^\.\///;

  # Clean up, just to be safe
  unlink("$temp_file.tmp");
  unlink("$temp_file.log");
  unlink("$temp_file.tmp.matching");

  my $last_jobs_time=time();

  while (1) {
    my ($i, $j);
    while (1) {
      # Fetch new jobs
      my $job = $jobque->dequeue_nb();
      # If there is nothing
      unless (defined($job)) {
        # Check if more jobs need to be done
        {
        lock($jobs_done);             #   Productive
        if ($jobs_done >= $jobs_todo || ($part != -1 && $jobs_done >= $part) || time()-$last_jobs_time > 3600) { #   Productive
          # lock($all_jobs_submitted);            #   DEBUGGING
          # if ($all_jobs_submitted) {            #   DEBUGGING
          if ($debug) {print STDERR "Thread $thread_id\tis leaving\n";}
          threads->exit;
        }}
        # If so, wait
        if ($debug) {print STDERR "Thread $thread_id\tis seeking work ($jobs_done / $jobs_todo)\n";}
        sleep(1);
      }
      else {
        $last_jobs_time=time();
        # Parse job
        ($i, $j) = split(" ",$job);
        # Break the fetch loop
        last;
      }
    }

    my $file_i = $files[$i];
    my $file_j = $files[$j];
    my $short_file_i = $file_i; $short_file_i =~ s/^.*\///;
    my $short_file_j = $file_j; $short_file_j =~ s/^.*\///;

    # Work
    &set_threads_per_process(scalar(threads->list()));
    my $result_ij = &blast($file_i,$file_j,$thread_id);

    my $result_ji;
    if ($file_i eq $file_j) {
      # One run is enough (selfblast)
      $result_ji = $result_ij;
    }
    else {
      $result_ji = &blast($file_j,$file_i,$thread_id);
    }

    if ($file_i eq $file_j && !$selfblast) {&Error("Selfblast is disabled but I want to check $file_i vs $file_j");}

    my %lengths;
    if ($file_i eq $file_j) {
      # Get lengths once is enough (selfblast)
      %lengths = %{&get_gene_lengths($file_i)};
    }
    else {
      %lengths = %{&get_gene_lengths($file_i,$file_j)};
    }
    my %reciprocal_matches = %{&match(\%lengths,$file_j,$file_i,$result_ij,$result_ji)};

    # Remove secondary hits if better exist (test here instead of later)
    %reciprocal_matches = %{&adaptive_best_blast_matches(\%reciprocal_matches)};

    if($selfblast && $file_i eq $file_j){
      my %seen_selfblast_edge;
      foreach (keys %reciprocal_matches) {
        my ($a,$b) = split(" ",$_);
        my $key = ($a lt $b ? "$a $b" : "$b $a" );
        if(!exists $seen_selfblast_edge{$key}){$seen_selfblast_edge{$key}=1}else{
          delete $reciprocal_matches{$_};
        }
      }
    }

    if ($synteny) {
      my ($ordered_matches, $track_pointer, $close_copies_pointer) = &synteny_matches(\%reciprocal_matches,$file_i,$file_j);
      open(PREGRAPH,">>$temp_file.tmp") || &Error("Could not open temp file '$temp_file.tmp': $!");
      print PREGRAPH $ordered_matches;
      close(PREGRAPH);
      my $ffadj_param = "-a $alpha";
      if ($duplication) { $ffadj_param .= " -R $duplication -M $cs";}
      my $cmd = "$po_path/proteinortho_ffadj_mcs.py $ffadj_param '$temp_file.tmp'";
      if ($debug) {print STDERR "$cmd\n";}
      my $synt_stats = qx($cmd);
      $synt_stats=~s/[\r\n]+$//;
      $synt_stats =~ s/#.+\n//;

      # Reverse mapping of full gene ids, two seperate maps in case of overlapps in short ids
      my %full_id_map_i;
      my %full_id_map_j;
      foreach (sort keys %reciprocal_matches) {
        my ($a, $b) = split(/\s/,$_);
        my $aa = &convertUniprotAndNCBI($a);
        my $bb = &convertUniprotAndNCBI($b);
        if ($aa ne $a) {
          if ($debug) {print STDERR "j_map: $aa -> $a\n";}
          $full_id_map_j{$aa} = $a;}
        if ($bb ne $b) {
          if ($debug) {print STDERR "i_map: $bb -> $b\n";}
          $full_id_map_i{$bb} = $b;}
      }

      # Reverse mapping of gene position to short id
      my %track = %{$track_pointer};
      my %close = %{$close_copies_pointer};
      # Generate hash for synteny hits
      my %synteny;

      unless (-s "$temp_file.tmp.matching") {
        print STDERR "\n$RED [Error] Failed to run $po_path/proteinortho_ffadj_mcs.py for\n$file_i vs $file_j\nMoving source to $temp_file.err for debugging\nI will continue, but results may be insufficient.$NC \n\n";
        system("mv $temp_file.tmp $temp_file.err");
        next;
      }

      open(OSYNGRAPH,"<$temp_file.tmp.matching") || &Error("Could not open temp file $temp_file.tmp.matching: $!'");
      while(<OSYNGRAPH>) {
          $_=~s/[\r\n]+$//;
          my ($i, $j, $score) = split(/\s+/,$_,3);
          if (!defined($score) || $i =~ /[^0-9]/ || $i == 0 || length($i) > 10) {next;}
          unless (defined($track{$file_i.$i})) {
            print STDERR "Could not find i: ".$file_i.$i."\n";  next;
          }
          unless (defined($track{$file_j.$j})) {
            print STDERR "Could not find j: ".$file_j.$j."\n";  next;
          }
          # Remap to full ID
          my $a = $track{$file_i.$i};
          my $b = $track{$file_j.$j};

          if (defined($full_id_map_i{$a})) {$a = $full_id_map_i{$a};}
          if (defined($full_id_map_j{$b})) {$b = $full_id_map_j{$b};}

          # Store
          $synteny{"$b $a"} = $score;

          # Close copies
          if ($neighbourjoin && defined($close{$i})) {
            my @partners = split(',',$close{$i});
            foreach (@partners) {
              my $c = $track{$file_i.$_};
              if (defined($full_id_map_i{$c})) {$c = $full_id_map_i{$c};}
              # Store
              $synteny{"$b $c"} = $score;
              if ($debug) {print STDERR "Storing addional proximity edge $a & $b -> $c\n";}
            }
          }
        }
        close(OSYNGRAPH);
        unlink("$temp_file.tmp");
        unlink("$temp_file.log");
        unlink("$temp_file.tmp.matching");

      {
      lock($syn_lock);

      open(SYN,">>$syngraph") || &Error("Could not open file '$syngraph': $!");
      print SYN "# $short_file_j\t$short_file_i\n";
      print SYN "# Scores: $synt_stats\n";
      foreach (sort keys %reciprocal_matches) {
        if (!defined($synteny{$_})) {if ($debug) {print STDERR "FAIL: $_\n";} next;}    # Not reported by gene-order algo
        my $line = "$_ ".$reciprocal_matches{$_}." $synteny{$_}";
        $line =~ s/ /\t/g;
        print SYN "$line\n";
      }
      close(SYN);
      }

    }

    {
      lock($graph_lock);

      my @arr0; #evalue ab
      my @arr1; #biscore ab
      my @arr2; #evalue ba
      my @arr3; #bitscore ba
      foreach (sort keys %reciprocal_matches) {
        my @spl=split(" ",$reciprocal_matches{$_});
        push(@arr0,$spl[0]);
        push(@arr1,$spl[1]);
        push(@arr2,$spl[2]);
        push(@arr3,$spl[3]);
      }

      open(GRAPH,">>$simgraph") || &Error("Could not open file '$simgraph': $!");
      print GRAPH "# $short_file_j\t$short_file_i\n# ".median(@arr0)."\t".median(@arr1)."\t".median(@arr2)."\t".median(@arr3)."\n";
      foreach (sort keys %reciprocal_matches) {
        my ($a,$b) = split(" ",$_);

        my $line = "$a $b ".$reciprocal_matches{$_};

        $line =~ s/ /\t/g;
        print GRAPH "$line\n";
      }
      close(GRAPH);
    }
    # Count
    {
      lock($jobs_done);
      $jobs_done++;
    }
    &print_blast_stats(); # Needs jobs_done to be free
  }
}

sub sum {
  my $ret = 0;
  foreach (@_){
    $ret += $_;
  }
  return $ret;
}
sub median {
  sum( ( sort { $a <=> $b } @_ )[ int( $#_/2 ), ceil( $#_/2 ) ] )/2;
}

sub identitybylength {
        # Accoding to the formula of Rost, 1999 (Twilight-zone)
  if($_[0] <= 11)   {return 100;}
  if($_[0] <= 450)  {return 480*$_[0]**(-0.32*(1+exp(-$_[0]/1000)));}
  return 19.5;
}

sub adaptive_best_blast_matches {
  my %reciprocal_matches = %{(shift)};

  if ($debug>1) {
    print STDERR "\nStart with ";
    print STDERR scalar(keys %reciprocal_matches);
    print STDERR " edges\n";
  }

  my %best;
  my %best_gen;
  # Gather best
  foreach (keys %reciprocal_matches) {
    my ($a,$b) = split(" ",$_);
    my ($evalue_ab,$bitscore_ab,$evalue_ba,$bitscore_ba) = split(" ",$reciprocal_matches{$_});
    if (!defined($best{$a}) || $best{$a} < $bitscore_ab) {
      $best{$a} = $bitscore_ab;
      $best_gen{$a} = $b;
    }
    if (!defined($best{$b}) || $best{$b} < $bitscore_ba) {
      $best{$b} = $bitscore_ba;
      $best_gen{$b} = $a;
    }
  }

  if ($debug>1) {
    my %gene_num;
    # Count gene number
    foreach (keys %reciprocal_matches) {
      my ($a,$b) = split(" ",$_);
      $gene_num{$a}++;
      $gene_num{$b}++;
    }
    print STDERR "Number of genes: ".scalar(keys %gene_num)."\n";

    foreach (keys %best) {
      print STDERR "Best score for $_ is $best{$_} ($best_gen{$_})\n";
    }
  }

  # Remove worse
  foreach (keys %reciprocal_matches) {
    my ($a,$b) = split(" ",$_);
    my ($evalue_ab,$bitscore_ab,$evalue_ba,$bitscore_ba) = split(" ",$reciprocal_matches{$_});
    if  ($best{$a}*$sim > $bitscore_ab) {delete $reciprocal_matches{$_}; if ($debug>1) {my $v = $bitscore_ab/$best{$a}; print STDERR "Removed $_ because $best{$a} vs $bitscore_ab ($v)\n";}}
    elsif   ($best{$b}*$sim > $bitscore_ba) {delete $reciprocal_matches{$_}; if ($debug>1) {my $v = $bitscore_ba/$best{$b}; print STDERR "Removed $_ because $best{$b} vs $bitscore_ba ($v)\n";}}
  }

  if ($debug>1) {
  print STDERR "\nEnd with ";
  print STDERR scalar(keys %reciprocal_matches);
  print STDERR " edges\n";

  my %gene_num;
  # Count gene number
  foreach (keys %reciprocal_matches) {
    my ($a,$b) = split(" ",$_);
    $gene_num{$a}++;
    $gene_num{$b}++;
  }
  print STDERR "Number of genes: ".scalar(keys %gene_num)."\n";}

  return \%reciprocal_matches;
}

sub synteny_matches {
  my %reciprocal_matches = %{(shift)};
  my $file_i = shift;
  my $file_j = shift;

  # Get order for both species (same hash as ids are non overlapping)
  my %order;
  my %track;
  for my $file ($file_i, $file_j) {
    # Get Coordinates for all genes
    my %coords = %{&read_details($file,1)};
    my $counter = 0;
    # Number them according to their order
    foreach my $id (sort
      {
        my @a = split("\t",$coords{$a});
        my @b = split("\t",$coords{$b});

#       #chr strand pos
#       if ($a[0] ne $b[0]) {return $a[0] cmp $b[0];}
#       if ($a[1] ne $b[1]) {return $a[1] cmp $b[1];}
#       return $a[2] <=> $b[2];

        #chr pos
        if ($a[0] ne $b[0]) {return $a[0] cmp $b[0];}
        return $a[2] <=> $b[2];
      } (keys(%coords))) {
      my @v = split("\t",$coords{$id});
      $order{$id} = ++$counter."\t".$v[1];  # Store strand info
      $track{$file.$counter} = $id;   # Reverse Mapping
    }
  }

  my $output = "";

  my @multis; # array that contains all multi-edges
  # Convert reciprocal matches to ffadj input
  foreach (keys %reciprocal_matches) {
    my @values = split(/\s/,$reciprocal_matches{$_});
    my ($a, $b) = split(/\s/,$_);
    unless (defined($order{$a})) {$a = &convertUniprotAndNCBI($a);}
    unless (defined($order{$b})) {$b = &convertUniprotAndNCBI($b);}
    my @a = split(/\s/,$order{$a});
    my @b = split(/\s/,$order{$b});
    unless (defined($a[0])) {&Error("Failed parsing gene IDs from blast output/gff input\n");}

    unless (defined($multis[$a[0]])) {$multis[$a[0]] = $b[0];}
    else         {$multis[$a[0]] .= ','.$b[0];}

    $output .= "$b[0]\t$a[0]\t";      # Positions
    if ($a[1] eq $b[1])   {$output .= "1\t";} # Same strand?
    else      {$output .= "-1\t";}
    my $score = (&edgeweight($values[0])+&edgeweight($values[2]))/2;  # Score made from e-values
    $output .= $score."\n";
  }

  # Check multis
  my %close_copies;
  if ($neighbourjoin) {
    for (my $i = 1; $i < scalar(@multis); $i++) {
      unless (defined($multis[$i])) {next;}
      my @partners = sort { $a <=> $b } split(',',$multis[$i]);
      if (scalar(@partners) <= 1) {next;}
      my $dist_limit = 2; # How far can tandem copies be away from each other? (0/1 = off, 2 = immediate, ...
      my $last = 999999999999999;
      foreach my $new (@partners) {
        if (abs($last-$new) < $dist_limit) {
          if (!defined($close_copies{$last}))   {$close_copies{$last} = $new;}
          else          {$close_copies{$last} .= ','.$new;}
          $close_copies{$new} = $last;    # The list is sortet, so we are here for the frist time
        }
        $last = $new;
      }
    }
  }

  return ($output, \%track, \%close_copies);
}

# Read the length of all genes in a given set of fasta files
sub get_gene_lengths {
  my %lengths;
  my $last_gene = "";
  my $lengths_lastgene=0;
  while (my $file = shift @_) {
    my $abs_path_file=abs_path "$file "; $abs_path_file=~s/ $//g;
    open(FASTA,"<$file") || &Error("Could not open '$file': $!");
    while (<FASTA>) {
      $_=~s/[\r\n]+$//;
      if ($_ =~ />/) {

        if($last_gene ne ""){
          if( (exists $lengths{$last_gene} && $lengths_lastgene > $lengths{$last_gene}) || !exists $lengths{$last_gene}){
            $lengths{$last_gene}=$lengths_lastgene;
            if($blastmode eq "autoblast" && $autoblast_fileis{$abs_path_file} eq "nucl"){
              $lengths{$last_gene}/=3;
            }
            $lengths{&convertUniprotAndNCBI($last_gene)}=$lengths{$last_gene};
          }
        }

        $_ =~ s/^>//; # remove the starting >
        $_ =~ s/\s.*//; # trim the description

        #$lengths{$_} = 0;
        $lengths_lastgene=0;
        $last_gene = $_;

      }
      else {
        #$lengths{$last_gene} += length($_);
        $lengths_lastgene += length($_);
      }
    }
    if($last_gene ne ""){
      if( (exists $lengths{$last_gene} && $lengths_lastgene > $lengths{$last_gene}) || !exists $lengths{$last_gene}){
        $lengths{$last_gene}=$lengths_lastgene;
        if($blastmode eq "autoblast" && $autoblast_fileis{$abs_path_file} eq "nucl"){
          $lengths{$last_gene}/=3;
        }
        $lengths{&convertUniprotAndNCBI($last_gene)}=$lengths{$last_gene};
      }
    }
    close(FASTA);
  }

  return \%lengths;
}

sub print_blast_stats {
  if (!$verbose) {return;}
  {
    if ($jobs_todo == 0) {&Error("Nothing to do. This should not happen! Please submit the FASTA file(s) and the parameter vector (above) to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue. ");}
    lock($jobs_done);
    my $percent = int($jobs_done/$jobs_todo*10000)/100;
    print STDERR "\r                                                                               ";

    if ($split_to_X_jobs == -1) {
      print STDERR "\rRunning blast analysis: $percent% ($jobs_done/$jobs_todo)";
    }
    else {  # 5.16
      $percent = int($jobs_done/$part*10000)/100;
      print STDERR "\rRunning blast analysis: $percent% ($jobs_done/$part, $jobs_todo in total)";
    }
  }
}

sub match {
  my %length = %{(shift)};
  my $file_i = shift;
  my $file_j = shift;
  my @i = @{(shift)};
  my @j = @{(shift)};

  my %legal_i = &get_legal_matches(\%length,$file_i,$file_j,@i);
  my %legal_j = &get_legal_matches(\%length,$file_j,$file_i,@j);

  return &get_reciprocal_matches(\%legal_i,\%legal_j);
}

sub get_reciprocal_matches {
  my %i = %{(shift)};
  my %j = %{(shift)};
  my %reciprocal;

  foreach (keys %i) {
    my ($i, $j) = split(" ",$_);

    # Reciprocal hit?
    if (!defined($j{"$j $i"})) {next;}

    # Merge both
    $reciprocal{$_} = $i{$_}." ".$j{"$j $i"}; # evalue_ij, bitscore_ij, evalue_ji, bitscore_ji
  }

  return \%reciprocal;
}

sub get_legal_matches {
  my %length = %{(shift)};
  my $file_i = shift;
  my $file_j = shift;

  my %result;
  foreach (@_) {
    my ($query_id,$subject_id,$local_identity,$alignment_length,$mismatches,$openings,$query_start,$query_end,$subject_start,$subject_end,$local_evalue,$local_bitscore) = split(/\s+/);

    if(!defined($query_id) || $query_id eq "" || !defined($subject_id) || $subject_id eq ""){next}

    if($blastmode eq "mmseqsp" || $blastmode eq "mmseqsn"){$local_identity = $local_identity*100;}
    if ($debug>1) {print STDERR "?$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}

    # Bug tracking
    unless ( defined($length{$query_id}) )  {if($query_id ne '#'){print STDERR "$RED [ERROR] Query gene ID '$query_id' is present in blast output but was not present in FASTA input! Skipping line. I will continue, but results may be insufficien $NC \n";} next;}
    unless ( defined($length{$subject_id}) )  {if($query_id ne '#'){print STDERR "$RED [ERROR] Subject gene ID '$subject_id' is present in blast output but was not present in FASTA input! Skipping line. I will continue, but results may be insufficien $NC \n";} next;}

    ## Check for criteria
    # Well formatted
    if (!defined($local_bitscore)){next;}
    if ($evalue < $local_evalue) {next;} # 5.17, post filter e-value

    # Percent identity
    if (!$twilight && $local_identity < $identity)          {if ($debug>1) {print STDERR "!$query_id -> $subject_id is removed because of identity threshold\n";} next;}
    if ( $twilight && $local_identity < &identitybylength($alignment_length)) {if ($debug>1) {print STDERR "!$query_id -> $subject_id is removed because of identity by length threshold\n";} next;}
    # Min. length
    if ($blastmode eq "tblastx+" || $blastmode eq "tblastx") {$alignment_length *= 3;}

    #if($blastmode eq "autoblast"){
    #  if($file_i eq "nucl" && $file_j eq "nucl"){
    #    $alignment_length /= 3;
    #  }
    #  # if i is nucl and j is prot -> then the alignment length is in aa length (all nucl lengths are allready /3) -> ok
    #}

    if ($alignment_length < $length{$query_id}*($coverage/100)+0.5)     {if ($debug>1) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold (alignment_length=$alignment_length<".( $length{$query_id}*($coverage/100)+0.5)."=$length{$query_id}*($coverage/100)+0.5)=length{query_id}*(coverage/100)+0.5,$query_id)\n";} next;}
    if ($alignment_length < $length{$subject_id}*($coverage/100)+0.5)   {if ($debug>1) {print STDERR "!$query_id -> $subject_id is removed because of coverage threshold (alignment_length=$alignment_length<".( $length{$subject_id}*($coverage/100)+0.5)."=$length{$subject_id}*($coverage/100)+0.5)=length{subject_id}*(coverage/100)+0.5,$subject_id)\n";} next;}

    if($isoform ne ""){

      my $query_id_iso=$query_id;
      my $subject_id_iso=$subject_id;
      if(exists $isoform_mapping{$file_i." ".$query_id} ){   $query_id_iso   = $isoform_mapping{$file_i." ".$query_id} }
      if(exists $isoform_mapping{$file_j." ".$subject_id} ){ $subject_id_iso = $isoform_mapping{$file_j." ".$subject_id} }

      # It hit itself (only during selfblast)
      # if ($selfblast && $query_id eq $subject_id)           {next;} # 5.16 reuse IDs
      ## Listing them in the graph is okay, clustering will ignore them

      # Similar hits? Take the better one
      if (defined($result{"$query_id_iso $subject_id_iso"})) {
        my ($remote_evalue, $remote_bitscore) = split(" ",$result{"$query_id_iso $subject_id_iso"});
        if ($local_evalue > $remote_evalue) {next;}
        if ($local_bitscore < $remote_bitscore) {next;}
      }

      # Store data
      if ($debug>1) {print STDERR "!$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}
      $result{"$query_id_iso $subject_id_iso"} = "$local_evalue $local_bitscore";

    }else{

      # It hit itself (only during selfblast)
      # if ($selfblast && $query_id eq $subject_id)           {next;} # 5.16 reuse IDs
      ## Listing them in the graph is okay, clustering will ignore them

      # Similar hits? Take the better one
      if (defined($result{"$query_id $subject_id"})) {
        my ($remote_evalue, $remote_bitscore) = split(" ",$result{"$query_id $subject_id"});
        if ($local_evalue > $remote_evalue) {next;}
        if ($local_bitscore < $remote_bitscore) {next;}
      }

      # Store data
      if ($debug>1) {print STDERR "!$query_id -> $subject_id ($local_evalue,$local_bitscore)\n";}
      $result{"$query_id $subject_id"} = "$local_evalue $local_bitscore";
    }
   
  }

  return %result;
}

# Auto set the number of CPUs
sub auto_cpus {
  if ($cpus == 0) {
    my $cpu_x = qx(getconf _NPROCESSORS_ONLN);    $cpu_x =~ s/[^0-9]//g;
    # Fallback
    if (length($cpu_x) == 0 || $cpu_x == 0) {
      # Linux
      if (-e "/proc/cpuinfo") {
        $cpu_x = qx(grep processor /proc/cpuinfo | wc -l);
      }
      # Try Mac
      else {
        $cpu_x = qx(system_profiler | grep CPUs:);
      }
      $cpu_x =~ s/[^0-9]//g;
    }
    if (length($cpu_x) == 0 || $cpu_x == 0) {
      print STDERR "failed! Use 1 core only\n";$cpu_x = 1;
    }
    if($verbose){print STDERR "Detected $cpu_x available CPU threads (adjust this with -cpus), ";}
    $cpus = int($cpu_x);
  }
  else {
    if($verbose){print STDERR "Using $cpus CPU threads, ";}
  }
}

sub log_Worker {
  local $SIG{KILL} = sub { threads->exit };
  my $tid = threads->tid();
  if($debug){ print STDERR "log_Worker : $tid spawn\n"; }
  while($do_log){
    while($log_QUEUE->pending()){
      if($debug){ print STDERR "log_Worker inside : $tid active threads=".(scalar grep { $_->is_running() } threads->list())." log_QUEUE=".$log_QUEUE->pending()." step1_QUEUE=".$step1_QUEUE->pending()." check_files_QUEUE=".$check_files_QUEUE->pending()."\n"; }
      my $log = $log_QUEUE->dequeue();
      if($verbose){ print STDERR $log; }
    }
    if($debug){ print STDERR "log_Worker : $tid active threads=".(scalar grep { $_->is_running() } threads->list())." log_QUEUE=".$log_QUEUE->pending()." step1_QUEUE=".$step1_QUEUE->pending()." check_files_QUEUE=".$check_files_QUEUE->pending()."\n"; }
    if(!$log_QUEUE->pending() && !$step1_QUEUE->pending() && !$check_files_QUEUE->pending() && (scalar grep { $_->is_running() } threads->list()) <= 0){$do_log=0}
    sleep 1;
  }
  if($debug){ print STDERR "log_Worker : $tid dead\n"; }
  threads->exit;
}
sub generate_indices_Worker {
  my $tid = threads->tid();
  local $SIG{KILL} = sub { threads->exit };
  if($debug){print STDERR "generate_indices_Worker:$tid spawn\n"}
  while($step1_QUEUE->pending()){
    if($debug){print STDERR "generate_indices_Worker:$tid while\n"}
    my $file = $step1_QUEUE->dequeue();
    if(!defined($file)){last;}
    if($debug){print STDERR "generate_indices_Worker:$tid working on $file\n"}
    &generate_indices($file);
    if($debug){print STDERR "generate_indices_Worker:$tid ok with $file\n"}
  }
  if($debug){print STDERR "generate_indices_Worker:$tid dead\n"}
  threads->exit;
}
sub generate_indices {
  my $file=$_[0];

  my $oldkeep=$keep;
  my $cmd="";
  my $msg="";
  if($debug){print STDERR "generate_indices:$file start\n"}
  
  if($verbose){$msg.= "Generating indices";if($force){$msg.= " anyway (forced).\n"}else{$msg.= ".\n";}}
  if ($blastmode eq "rapsearch") {
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        $cmd="$makedb -d '$file' -n '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb -d '$file' -n '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";
          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb -d '$file' -n '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
        }
      }
    #}
  }
  elsif ($blastmode eq "diamond" ) {
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        $cmd="$makedb '$file' -d '$file.$blastmode' --quiet";
        if($debug){print STDERR "generate_indices:$file $cmd\n"}
        my $makedb_ret = `$cmd 2>&1`;
        if($debug){print STDERR "generate_indices:$file $cmd DONE\n"}
        if ($? != 0 && $makedb_ret =~/--ignore-warnings/) {
          $msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -d '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now retry with the '--ignore-warnings' option!\n");
          $cmd="$makedb '$file' -d '$file.$blastmode' --quiet";
          if($debug){print STDERR "$cmd\n"}
          my $makedb_ret = `$cmd --ignore-warnings 2>&1`;
          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
        }elsif($? != 0){$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -d '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";
          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb '$file' -d '$tmp_path/DB/".basename($file).".$blastmode' --quiet";
          if($debug){print STDERR "$cmd\n"}
          $makedb_ret = `$cmd 2>&1`;
          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
        }
      }
    #}
  }
  elsif ($blastmode eq "topaz") {
    #foreach my $file (@_) {
      #if ($file =~ /\s/) {$msg.= ("$ORANGE\n[WARNING]$NC : File name '$file' contains whitespaces. This might lead to undesired effects. If you encounter unusual behavior, please change the file name!$NC\n");}

      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        $cmd="$makedb index -f '$file' -p '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb index -f '$file' -p '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb index -f '$file' -p '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."'");
        }
      }
    #}
  }
  elsif ($blastmode eq "mmseqsp") {
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        $cmd="$makedb '$file' '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb '$file' '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
        }
      }
    #}
  }
  elsif ($blastmode eq "mmseqsn") {
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        $cmd="$makedb '$file' '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb '$file' '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
        }
      }
    #}
  }
  elsif ($blastmode eq "usearch" || $blastmode eq "ublast") {
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        $cmd="$makedb '$file' -output '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -output '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb '$file' -output '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
        }
      }
    #}
  }
  elsif ($blastmode eq "lastp" || $blastmode eq "lastn") {
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        if($blastmode eq "lastp"){
          $cmd="$makedb -p '$file.$blastmode' '$file'";
          my $makedb_ret = `$cmd 2>&1`;
          if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb -p '$file.$blastmode' '$file'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

            mkdir("$tmp_path/DB");
            if($step==1){$oldkeep=$keep;$keep=1;}
            $cmd = "$makedb -p '$tmp_path/DB/".basename($file).".$blastmode' '$file'";
            $makedb_ret = `$cmd 2>&1`;
            if($?!=0){ &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably }something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So try to update $blastmode, consider another blast algorithm (-p) or consider to submitting this case to .");}
            system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
          }
        }else{
          $cmd="$makedb '$file.$blastmode' '$file'";
          my $makedb_ret = `$cmd 2>&1`;
          if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file.$blastmode' '$file'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

            mkdir("$tmp_path/DB");
            if($step==1){$oldkeep=$keep;$keep=1;}
            $cmd = "$makedb '$tmp_path/DB/".basename($file).".$blastmode' '$file'";
            $makedb_ret = `$cmd 2>&1`;
            if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}
            system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");
          }
        }
      }
    #}
  }
  elsif ($blastmode =~ m/.*blat.*/) {
    if($verbose){$msg.= "No indices needed\n";}
  }
  elsif ($blastmode =~ m/blast.*\+$/) {  # new blast+
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        if ($debug) {$msg.= "$makedb '$file' -out '$file.$blastmode'\n";}
        $cmd="$makedb '$file' -out '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -out '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb '$file' -out '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");

          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}}
      }
    #}
    unlink('formatdb.log') if -e 'formatdb.log';
  }elsif ($blastmode =~ m/autoblast/) {  # new blast+
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        my $abs_path_file=abs_path "$file "; $abs_path_file=~s/ $//g;
        if(!exists $autoblast_fileis{$abs_path_file}){
          $msg.= "$ORANGE\n[WARNING]$NC I could not detect the type of '$file', i assume aminoacid sequences...\n";
          $autoblast_fileis{$abs_path_file}="prot";
        }
        if ($debug) {$msg.= "$makedb -dbtype ".$autoblast_fileis{$abs_path_file}." -in '$file' -out '$file.$blastmode'\n";}
        $cmd="$makedb -dbtype ".$autoblast_fileis{$abs_path_file}." -in '$file' -out '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb -dbtype ".$autoblast_fileis{$abs_path_file}." -in '$file' -out '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb '$file' -out '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");

          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}}
      }
    #}
    unlink('formatdb.log') if -e 'formatdb.log';
  }else { # old blastall
    #foreach my $file (@_) {
      if(!$force && `ls '${file}'.${blastmode}* 2>/dev/null` ne ""){
        if ($verbose) {$msg.= "The database for '$file' is present and will be used\n";}
      }else{
        if ($verbose) {$msg.= "Building database for '$file'\t(".$gene_counter{$file}." sequences)\n";}
        if ($debug) {$msg.= "$makedb '$file' -out '$file.$blastmode'\n";}
        $cmd="$makedb '$file' -n '$file.$blastmode'";
        my $makedb_ret = `$cmd 2>&1`;
        if ($? != 0) {$msg.= ("$ORANGE\n[WARNING]$NC ".$blastmode." failed to create a database ($makedb '$file' -n '$file.$blastmode'). The output is:\n-----------------\n$makedb_ret-----------------\nI will now proceed with writing the database files to the DB/ directory in $tmp_path (-tmp)."); if($step==1){$msg.= "$ORANGE Please ensure that you use -tmp=$tmp_path -keep (and use the same -project= name) for future analysis.$NC";}print "\n";

          mkdir("$tmp_path/DB");
          if($step==1){$oldkeep=$keep;$keep=1;}
          $cmd = "$makedb '$file' -n '$tmp_path/DB/".basename($file).".$blastmode'";
          $makedb_ret = `$cmd 2>&1`;
          system("ln -s '".abs_path($file)."' '$tmp_path/DB/".basename($file)."' 2>/dev/null");

          if($?!=0){ $keep=$oldkeep; &Error($makedb_ret."\nThe database generation failed once again, please investigate the output from above. There is probably something wrong with the fasta files or the version of $blastmode cannot handle the database generation. So please try one of the following:\n- update $blastmode\n- consider another blast algorithm (-p)\n- send this issue to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com.");}}
      }
    #}
    unlink('formatdb.log') if -e 'formatdb.log';
  }
##MARK_FOR_NEW_BLAST_ALGORITHM
  if($debug){print STDERR "generate_indices:$file almost end\n"}
  if($verbose){print STDERR $msg}
  #$log_QUEUE->enqueue($msg);
  if($debug){print STDERR "generate_indices:$file end\n"}
  return;
}

sub blast {

  my $command = "";
  my @data; # return table

  my $a = $_[0];
  my $b = $_[1];
  my $thread_id = $_[2];
  #if( $blastmode eq "mmseqsp" || $blastmode eq "mmseqsn" || $blastmode eq "mmseqs" ){ if(!-d "$tmp_path/mmseqs".$thread_id ){mkdir("$tmp_path/mmseqs".$thread_id);}else{rmdir("$tmp_path/mmseqs".$thread_id);mkdir("$tmp_path/mmseqs".$thread_id);}}

  $a = basename($a);
  $b = basename($b);

  my $bla = "$b.vs.$a.".$blastmode;

  my $WD="${tmp_path}/${bla}_WD";
  mkdir($WD);

  my $blastOptions_diamond=''; #additional option --sensitive (if gene length is larger than 40)
  my $max_genlen = 1;
    if($blastmode eq "diamond"){
      if (index($blastOptions, "--ultra-sensitive") == -1 && 
          index($blastOptions, "--very-sensitive") == -1 &&  
          index($blastOptions, "--mid-sensitive") == -1 &&  
          index($blastOptions, "--fast") == -1 &&  
          index($blastOptions, "--more-sensitive") == -1 && index($blastOptions, "--sensitive") == -1) {
      if( $max_gene_length_diamond{$a} > 40 || $max_gene_length_diamond{$b} > 40 ){ $blastOptions_diamond = "--sensitive" }
    }
  }

  my $printSTDERR="";
  if($verbose != 2){ $printSTDERR='2>/dev/null'; }

  my ($fileType) = $a =~ /\.(\w*)$/;

  $a = abs_path "$_[0] "; # get a b again for blast algorithm (now they are not basenamed)
  $b = abs_path "$_[1] ";
  $a=~s/ $//g; # this is a trick, to get the abs_path of the symlink (not the target) 1. add whitespace to filename 2. call abs_path 3. remove whitespace
  $b=~s/ $//g; # otherwise the target is printed, which does not neccesarilly contain the DB files

  if(-e $tmp_path."/DB/".basename($a)){ $a = "../DB/".basename($a); }

  #
  # 1. run the blast algorithm -> produce a $bla.tmp
  # 2. do some post-processing 
  # 3. rename to $bla
  # 4. load $bla to @data 
  # 5. delete $bla unless -keep
  #
  # note: e.g. diamond can directly pass the output to @data
  # 
  # sh -ic '{ cat 1>&3; kill 0; } | { sleep 2; kill 0; }' 3>&1

  if    ($blastmode eq "blastp_legacy" || $blastmode eq "blastn_legacy" || $blastmode eq "tblastx_legacy") {lock($threads_per_process); $command = $binpath."blastall -a $threads_per_process -d '$a.$blastmode' -i '$b' -p $blastmode -m8 -e $evalue $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "autoblast" && $autoblast_fileis{$a} eq "prot" && $autoblast_fileis{$b} eq "prot")  {lock($threads_per_process); $command = $binpath."blastp -num_threads $threads_per_process -db '$a.$blastmode' -query '$b' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "autoblast" && $autoblast_fileis{$a} eq "nucl" && $autoblast_fileis{$b} eq "nucl")  {lock($threads_per_process); $command = $binpath."blastn -num_threads $threads_per_process -db '$a.$blastmode' -query '$b' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "autoblast" && $autoblast_fileis{$a} eq "nucl" && $autoblast_fileis{$b} eq "prot")  {lock($threads_per_process); $command = $binpath."tblastn -num_threads $threads_per_process -db '$a.$blastmode' -query '$b' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "autoblast" && $autoblast_fileis{$a} eq "prot" && $autoblast_fileis{$b} eq "nucl")  {lock($threads_per_process); $command = $binpath."blastx -num_threads $threads_per_process -db '$a.$blastmode' -query '$b' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "blastp+")   {lock($threads_per_process); $command = $binpath."blastp -num_threads $threads_per_process -db '$a.$blastmode' -query '$b' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "blastn+")   {lock($threads_per_process); $command = $binpath."blastn -num_threads $threads_per_process -db '$a.$blastmode' -query '$b' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "tblastx+")  {lock($threads_per_process); $command = $binpath."tblastx -num_threads $threads_per_process -db '$a.$blastmode' -query '$b' -evalue $evalue -outfmt 6 $blastOptions $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "blatp")     {lock($threads_per_process); $command = $binpath."blat -prot '$a' '$b' -out=blast8 '../$bla.tmp' $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "blatn")     {lock($threads_per_process); $command = $binpath."blat '$a' '$b' -out=blast8 '../$bla.tmp' $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "blatx")     {lock($threads_per_process); $command = $binpath."blat '$a' '$b' -t=dnax -q=dnax -out=blast8 '../$bla.tmp' $blastOptions >\/dev\/null $printSTDERR";}
  #elsif ($blastmode eq "pblatp")      {lock($threads_per_process); $command = $binpath."pblat -prot '$a' '$b' -threads=$threads_per_process -out=blast8 '../$bla' >\/dev\/null 2>&1";}
  #elsif ($blastmode eq "pblatn")      {lock($threads_per_process); $command = $binpath."pblat '$a' '$b' -threads=$threads_per_process -out=blast8 '../$bla' >\/dev\/null 2>&1";}
  #elsif ($blastmode eq "pblatx")      {lock($threads_per_process); $command = $binpath."pblat '$a' '$b' -t=dnax -q=dnax -threads=$threads_per_process -out=blast8 '../$bla' >\/dev\/null 2>&1";}
  elsif ($blastmode eq "diamond" )  {lock($threads_per_process); $command = $binpath."diamond blastp --threads $threads_per_process --db '$a.$blastmode' --query '$b' -e $evalue --outfmt 6 --quiet $blastOptions $blastOptions_diamond $printSTDERR >'../$bla.tmp'";}# .($keep ? " | tee '../$bla.tmp'" : "");}
  elsif ($blastmode eq "topaz" )    {lock($threads_per_process); $command = $binpath."topaz search -T $threads_per_process -p '$a.$blastmode' -f '$b' -E $evalue --blasttab $blastOptions >'../$bla.tmp' $printSTDERR";}
  elsif ($blastmode eq "ublast")    {lock($threads_per_process); $command = $binpath."usearch -ublast '$b' -db '$a.$blastmode' -threads $threads_per_process -evalue $evalue -blast6out '../$bla.tmp' $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "usearch")   {lock($threads_per_process); $command = $binpath."usearch -usearch_local '$b' -db '$a.$blastmode' -threads $threads_per_process -evalue $evalue -id 0 -blast6out '../$bla.tmp' $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "rapsearch") {lock($threads_per_process); $command = $binpath."rapsearch -s T -d '$a.$blastmode' -q '$b' -z $threads_per_process -e $evalue -out '../$bla.tmp' $blastOptions >\/dev\/null $printSTDERR";}
  elsif ($blastmode eq "lastp")     {lock($threads_per_process); $command = $binpath."lastal -E$evalue -fBlastTab+ -pBL62 -P$threads_per_process '$a.$blastmode' '$b' $blastOptions > '../$bla.tmp' $printSTDERR";}
  elsif ($blastmode eq "lastn")     {lock($threads_per_process); $command = $binpath."lastal -E$evalue -fBlastTab+ -P$threads_per_process '$a.$blastmode' '$b' $blastOptions > '../$bla.tmp' $printSTDERR";}
  elsif ($blastmode eq "mmseqsn" || $blastmode eq "mmseqsp")       {lock($threads_per_process); $command = $binpath."mmseqs search '$b.$blastmode' '$a.$blastmode' '../$bla.tmp' mmseqs$thread_id --threads $threads_per_process -e $evalue $blastOptions >\/dev\/null $printSTDERR";}
  else  {&Error("This should not happen! Please submit the FASTA file(s) and the parameter vector (above) to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue.");}
  ##MARK_FOR_NEW_BLAST_ALGORITHM

  if($identical || $range>=0){ $command=~s/-?-outfmt 6 (?!qseqid)//g; } # remove additional outfmt 6 (not followed by qseqid)
  # NOTE diamond fails on --outfmt '6 qseqid ...' but is ok with --outfmt 6 qseqid ... 

  my $plain_command=$command;
  if ( $blastmode eq "mmseqsn" || $blastmode eq "mmseqsp" ){
    $command = "(cd '$WD' && $command)\n";
  }else{
    $command = "cat << 'EOF'|sh;\n$proteinortho_exec; (cd '$WD' && proteinortho_exec \"($command)\")\nEOF\n";
  }

  # File does not exists yet or I am forced to rewrite it
  if (!(-e "$tmp_path/$bla") || $force) {

    if (-e "$tmp_path/$bla" && $force) { unlink("$tmp_path/$bla"); }

    if ($debug || $verbose==2) {print STDERR "$command\n";}                     # 5.16

    if ($blastmode eq "diamond") {
      
      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      #if ($? != 0) {
      #  $command = $binpath."diamond blastp --ignore-warnings --threads $threads_per_process --db '$a.$blastmode' --query '$b' -e $evalue --outfmt 6 --quiet $blastOptions $blastOptions_diamond $printSTDERR ".($keep ? " | tee '../$bla.tmp'" : "");
      #  @data=`(cd '$WD' && $command)`;
      #  if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      #}
      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla\n";&reset_locale();die;}}}
      system("mv '$tmp_path/$bla.tmp' '$tmp_path/$bla'");

    }elsif ($blastmode eq "rapsearch") {

      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      # NOTE -s f does not work everytime, therefore here is a conversion of -s t to f (ln -> evalues)
      open(OUTM,">>$tmp_path/$bla.m82");open(INM,"<$tmp_path/$bla.tmp.m8");
      while (<INM>){if(length($_)==0 || substr($_,0,1)eq"#"){next;}my @arr=split("\t",$_); print OUTM $arr[0]."\t".$arr[1]."\t".$arr[2]."\t".$arr[3]."\t".$arr[4]."\t".$arr[5]."\t".$arr[6]."\t".$arr[7]."\t".$arr[8]."\t".$arr[9]."\t".exp($arr[10])."\t".$arr[11];}
      close(OUTM);close(INM);
      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp.m8">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla.m8\n";&reset_locale();die;}}}
      unlink "$tmp_path/$bla.aln"; #remove aln file
      system("tail -n +6 '$tmp_path/$bla.m82' >'$tmp_path/$bla'"); # remove head/comment lines of rapsearch
      unlink "$tmp_path/$bla.m82";
      unlink "$tmp_path/$bla.tmp.m8";

    }elsif ($blastmode eq "usearch" || $blastmode eq "ublast") {

      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla\n";&reset_locale();die;}}}
      system("perl $po_path/proteinortho_formatUsearch.pl '$tmp_path/$bla.tmp' >'$tmp_path/$bla'"); # problem with ublast/usearch: gene names include the description.
      unlink "$tmp_path/$bla.tmp";

    }elsif ($blastmode eq "topaz") {

      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla\n";&reset_locale();die;}}}
      system("tail -n +2 '$tmp_path/$bla.tmp' > '$tmp_path/$bla'");
      unlink "timing.txt";
      unlink "$tmp_path/$bla.tmp";

    }elsif ($blastmode eq "lastp" || $blastmode eq "lastn") {

      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla\n";&reset_locale();die;}}}
      system("mv '$tmp_path/$bla.tmp' '$tmp_path/$bla'");

    }elsif ($blastmode =~ m/.*blat.*/) {

      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla\n";&reset_locale();die;}}}
      system('awk -F\'\t\' \'{if($11<'.$evalue.')print $0}\' \''."$tmp_path/$bla".'.tmp\' > \''."$tmp_path/$bla".'\'');
      unlink "$tmp_path/$bla.tmp";
    
    }elsif ($blastmode eq "mmseqsn" || $blastmode eq "mmseqsp") {

      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}

      if($threads_per_process>1){
        system("mv '$tmp_path/${bla}'.tmp.index '$tmp_path/${bla}'.tmp-index");
        system("mv '$tmp_path/${bla}'.tmp.dbtype '$tmp_path/${bla}'.tmp-dbtype");
        system("if [ ! -e '$tmp_path/$bla' ]; then cat '$tmp_path/$bla'.tmp.* > '$tmp_path/$bla'.tmp; fi");
        system("mv '$tmp_path/${bla}'.tmp-index '$tmp_path/${bla}'.tmp.index");
        system("mv '$tmp_path/${bla}'.tmp-dbtype '$tmp_path/${bla}'.tmp.dbtype");
      }
      
      system($binpath."mmseqs convertalis '$b.$blastmode' '$a.$blastmode' '$tmp_path/$bla.tmp' '$tmp_path/$bla.tmp2' >\/dev\/null 2>\/dev\/null");

      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla.tmp\n";&reset_locale();die;}}}
      system('sed -re \'s/([0-9\.]+)(E)(\-|\+)([0-9\.]+)/\1e\3\4/g\' \''."$tmp_path/$bla".'.tmp2\' >\''."$tmp_path/$bla".'\' 2>\/dev\/null');

    }else { # -p=blastp,blastn,autoblast, ...

      # @data=`$command`;
      system("$command");
      if ($? != 0) {&TypicalBlastError($blastmode,$plain_command,$?,$!);}
      if($debug eq "test_sort"){while (<"$tmp_path/$bla.tmp">){if ($_ =~ /[^\t]+([,])[^\t]+[eE]/) {print "found forbidden symbol '$1' at $_ in $tmp_path/$bla.tmp\n";&reset_locale();die;}}}
      system("mv '$tmp_path/$bla.tmp' '$tmp_path/$bla'");
    }

##MARK_FOR_NEW_BLAST_ALGORITHM

    if(-e "$tmp_path/$bla" && scalar @data == 0){
      @data = &readFile("$tmp_path/$bla");
      if(!$keep){unlink "$tmp_path/$bla"}
    }

  }else{
    if ($verbose) {print STDERR "\nNote: '$tmp_path/$bla' exists, using pre-calculated data\n";}

    @data = &readFile("$tmp_path/$bla");
    if(!$keep){unlink "$tmp_path/$bla"}
  }

  if($selfblast){
    @data = map {
      my @a = split("\t",$_); 
      ($a[0] ne $a[1]) ? ($_) : () 
    } @data;
  }
  if($identical){
    @data = map {
      my @a = split("\t",$_); 
      (int($a[2]) eq 100 && int($a[6]) == 1 && int($a[8]) == 1 && int($a[9]) == int($a[7]) && int($a[12]) == int($a[7]) && int($a[13]) == int($a[12])) ? (join("\t",@a[0..11])) : ()
    } @data;
  }
  if($range>=0){
    @data = map {
      my @a = split("\t",$_); 
      (abs(int($a[12]) - int($a[13])) <= $range) ? (join("\t",@a[0..11])) : ()
    } @data;
  }
  
  return \@data;
}

sub readFile {
  open(FILE,"<$_[0]") || &Error("Error, could not open file $_[0]: $!");
  my @data;
  @data = <FILE>;
  close(FILE);
  chomp @data;
  return @data;
}

sub failed_to_detected_msg{
  print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastp+ as fallback.\n");
  print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
   sleep 10;
  print STDERR "\nWell then, proceeding...\n\n";
}

sub check_bins {

  if($blastmode eq "blast"){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search blastp+ and continue...$NC"; $blastmode="blastp+";}
  if($blastmode eq "last"){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search lastp and continue...$NC";$blastmode.="p";}
  if($blastmode eq "mmseqs"){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search mmseqsp and continue...$NC";$blastmode.="p";}
  if($blastmode =~ m/usearch.+/){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search -p=usearch and continue...$NC";$blastmode="usearch";}
  if($blastmode =~ m/ublast.+/){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search -p=ublast and continue...$NC";$blastmode="ublast";}
  if($blastmode =~ m/rapsearch.+/){print STDERR "$ORANGE\n[WARNING]$NC There is no blastmode -p=$blastmode. I will choose the protein search -p=rapsearch and continue...$NC";$blastmode="rapsearch";}
  ##MARK_FOR_NEW_BLAST_ALGORITHM

  if ($blastmode eq "blastp+" || $blastmode eq "blastn+" || $blastmode eq "tblastx+") {
    my $tmp = $blastmode;
    $tmp =~ s/\+//g;
    my $cmd = $binpath."$tmp -h";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /DESCRIPTION.*?\n\s*(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      # Commands
      if  ($blastmode eq "blastp+") {$makedb = $binpath."makeblastdb $makeBlastOptions -dbtype prot -in";}
      elsif ($blastmode eq "blastn+" || $blastmode eq "tblastx+") {$makedb = $binpath."makeblastdb $makeBlastOptions -dbtype nucl -in";}
      else  {&Error("This should not happen! Please submit the FASTA file(s) and the parameter vector (above) to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue.");}

      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      return;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$tmp'.\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)");
  }elsif ($blastmode eq "blast+") {
      &Error("Please call -p=blastp+ for protein datasets and -p=blastn+ for nucleotide datasets (and -p=tblastx+ for translated query/db).");
  }
  elsif ($blastmode eq "autoblast" ) {
    my $tmp = $blastmode;
    $tmp =~ s/\+//g;
    my $cmd = $binpath."blastp -h";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /DESCRIPTION.*?\n\s*(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      # Commands
      $makedb = $binpath."makeblastdb $makeBlastOptions ";
      
      if($verbose){print STDERR "Detected 'blast+' version $versionnumber\n";$blastversion=$versionnumber;}
      return;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$tmp'.\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)");
  }elsif ($blastmode eq "blast+") {
      &Error("Please call -p=blastp+ for protein datasets and -p=blastn+ for nucleotide datasets (and -p=tblastx+ for translated query/db).");
  }
  elsif ($blastmode eq "blastp_legacy" || $blastmode eq "blastn_legacy" || $blastmode eq "tblastx_legacy") {
    my $cmd = $binpath."blastall";
    my @blastv = qx($cmd 2>&1);
    foreach (@blastv) {
      $_=~s/[\r\n]+$//;
      if ($_ =~ /blastall.+?([^\s]+)/) {
        my $versionnumber = $1;
        if  ($blastmode eq "blastp_legacy") {$makedb = $binpath."formatdb $makeBlastOptions -p T -o F -i";}
        elsif   ($blastmode eq "blastn") {$makedb = $binpath."formatdb $makeBlastOptions -p F -o F -i";}
        elsif   ($blastmode eq "tblastx") {$makedb = $binpath."formatdb $makeBlastOptions -p F -o F -i";}
        else  {&Error("This should not happen! Please submit the FASTA file(s) and the parameter vector (above to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com to help fixing this issue.");}

        if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
        return;
      }
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/blastall'.\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)");
  }elsif ($blastmode eq "blast") {
    &Error("Please call -p=blastp_legacy for protein datasets and -p=blastn for nucleotide datasets.");
  }
  elsif ($blastmode eq "topaz") {
    my $cmd = $binpath."topaz -h";
    my @topazv = qx($cmd 2>&1);
    foreach (@topazv) {
      $_=~s/[\r\n]+$//;
      if ($_ =~ /usage: TOPAZ(.+)/) {
        $makedb = $binpath."topaz $makeBlastOptions";
        if($verbose){print STDERR "Detected '$blastmode'\n";}
        return;
      }
    }
    &failed_to_detected_msg();
    $blastmode="blastp+";
    goto RESTART;
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode =~ m/^blat[np]/) {
    my $cmd = $binpath."blat";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /.*BLAT.*v\. ([\d\.]*)/) {
      my $versionnumber = $1;
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      $makedb="";
      return;
    }else{
      &failed_to_detected_msg();
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }elsif ($blastmode eq "blat") {
    &Error("Please call -p=blatp for protein datasets and -p=blatn for nucleotide datasets.");
  }
  # elsif ($blastmode =~ m/^pblat.*/) {
  #   my $cmd = $binpath."pblat";
  #   my $out = qx($cmd);
  #   if (defined($out) && $out =~ /.*BLAT.*v\. ([\d\.]*)/) {
  #     my $versionnumber = $1;
  #     if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
  #     $makedb="";
  #     return;
  #   }
  #   &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  # }
  elsif ($blastmode eq "rapsearch") {
    my $cmd = $binpath."$blastmode -h";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /rapsearch\sv([\d\.]*)/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."prerapsearch $makeBlastOptions";
     if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      return;
    }else{
      &failed_to_detected_msg();
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "mmseqsn" || $blastmode eq "mmseqsp") {
    my $cmd = $binpath."mmseqs version";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /([a-zA-Z0-9]+)/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."mmseqs createdb $makeBlastOptions";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      return;
    }else{
      print STDERR ("\n!!!$ORANGE\n[WARNING]$NC Failed to detect '$blastmode'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)\nI try now -p=blastn+ as fallback.");
      print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      print STDERR "\nWell then, proceeding...\n\n";
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }elsif ($blastmode eq "mmseqs") {
    &Error("Please call -p=mmseqsp for protein datasets and -p=mmseqsn for nucleotide datasets.");
  }
  elsif ($blastmode eq "diamond") {
    my $cmd = $binpath."diamond version 2>/dev/null";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /diamond\sversion\s(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."diamond makedb -p $cpus $makeBlastOptions --in";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      if($versionnumber =~ m/^0\.9\.(\d+)/){ if($1 < 29){
        print STDERR "\n!!!! \nWARNING '$blastmode' version $versionnumber has a known bug that incorrectly computes the length of an alignment, thus the coverage threshold can produce wrong results leading in false negatives. See https://gitlab.com/paulklemm_PHD/proteinortho/issues/24 for more details.\n\n >>> Please update diamond to 0.9.29 or higher <<<\n";
        print STDERR "\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
        sleep 10;
        print STDERR "\nWell then, proceeding...\n\n";} }
      return;
    }else{
      &failed_to_detected_msg();
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "ublast") {
    my $cmd = $binpath."usearch -version";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /usearch\sv(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."usearch $makeBlastOptions -makeudb_ublast";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      return;
    }else{
      &failed_to_detected_msg();
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "usearch") {
    my $cmd = $binpath."$blastmode -version";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /usearch\sv(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."usearch $makeBlastOptions -makeudb_usearch";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      return;
    }else{
      &failed_to_detected_msg();
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/$blastmode'\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...).");
  }
  elsif ($blastmode eq "lastp" || $blastmode eq "lastn") {
    my $cmd = $binpath."lastal -V";
    my $out = qx($cmd 2>&1);
    if (defined($out) && $out =~ /lastal\s(.+)\n/) {
      my @version = split(/\s+/,$1);
      my $versionnumber = pop @version;
      $makedb = $binpath."lastdb $makeBlastOptions";
      if($verbose){print STDERR "Detected '$blastmode' version $versionnumber\n";$blastversion=$versionnumber;}
      return;
    }else{
      &failed_to_detected_msg();
      $blastmode="blastp+";
      goto RESTART;
    }
    &Error("Failed to detect '$blastmode'! Tried to call '$binpath/lastal'.$NC\nPlease install $blastmode in $binpath (or specify another binpath with -binpath=/home/...)");
  }elsif ($blastmode eq "last" || $blastmode eq "lastal") {
    &Error("Please call -p=lastp for protein datasets and -p=lastn for nucleotide datasets.");
  }
  ##MARK_FOR_NEW_BLAST_ALGORITHM

  &Error("Blast mode '$blastmode' is not supported. Feel free to ask the author to add it.");
}

# Check plausibility of files
sub check_files {
  if ( ( scalar(@files) == 0 || (scalar(@files) == 1 && $selfblast==0) ) && $step != 3)   {&print_usage_more; &Error("I need at least two files to compare something!");}
  if($verbose){print STDERR "Checking input files";if($checkfasta){print STDERR " carefully (-checkfasta).\n"}else{print STDERR ".\n";}}

  #foreach my $file (@files) { $check_files_QUEUE->enqueue($file); }

  my %files_dedup;
  # 6.0.35: galaxy error (provides gff by default -> tries to read the gff as fasta files -> errors)
  @files = grep { !( $_ =~ /\.g[ft]f[34]?$/ ) } @files;

  foreach my $file (@files) {
    if($debug){print STDERR "->$file\n";}
    if(-e $file && $file=~/\.gz$/){
      if($verbose){print STDERR "ERROR input '$file' seems to be compressed...\n";}
      exit 1;
    }elsif(-e $file && exists $files_dedup{abs_path($file)}){
      if($verbose){print STDERR "found duplicated file $file, I will ignore it ... \n";}
    }elsif(-e $file){
      $files_dedup{abs_path($file)}=$file;
      #if ($verbose) {print STDERR "Checking $file... ";}
      &read_details($file,0);
      #$check_files_QUEUE->enqueue($file);
      #if ($blastmode eq "autoblast") {print STDERR " (".$autoblast_fileis{$file}.") ";}
      #if ($verbose) {print STDERR "ok\n";}
    }elsif(!-e $file){
      &Error("I cannot find $file...");
    }elsif(!-r $file){
      &Error("I cannot read $file...");
    }
  }
  @files=values(%files_dedup);

  if($debug){print STDERR "Checking input files queue:".($check_files_QUEUE->pending())."\n"}

  my $all_only_one_entry=1;
  foreach my $file (@files) {
    if($gene_counter{$file}>1){$all_only_one_entry=0;}
  }
  if(!$force && $all_only_one_entry && $allowedAlphabet->{$blastmode} eq "n"){
    print STDERR "[WARNING]\n[WARNING] (1) all input files contain only one entry and (2) you requested a nucleotide blast algorithm. It looks like you inputted raw genomes and this is not supported by proteinortho ! Please extract e.g. coding or non-coding sequences of the input genomes (using the gff/gtf feature table and e.g. bedtools) or predict your feature sequences with e.g. augustus. Furthermore, transcriptome files will work just fine here.\nI will proceed in 5 seconds anyway but be careful!\n";
    sleep 5;
  }elsif($all_only_one_entry){
    print STDERR "[WARNING]\n[WARNING] All input files contain only one entry. It looks like you maybe inputted raw genomes or single proteins. This is not the expected input for proteinortho ...\nI will proceed in 5 seconds anyway but be careful!\n";
    sleep 5;
  }
}
sub check_files_Worker {
  local $SIG{KILL} = sub { threads->exit };
  my $tid = threads->tid();
  if($debug){print STDERR "check_files_Worker:$tid spawn\n"}
  my %seen_files;
  while($check_files_QUEUE->pending()){
    my $file = $check_files_QUEUE->dequeue();
    if(!defined($file)){last;}
    if(exists $seen_files{abs_path($file)}){next}
    $seen_files{abs_path($file)}=1;
    if($debug){print STDERR "check_files_Worker:$tid working on $file\n"}
    &read_details($file,0);
  }
  if($debug){print STDERR "check_files_Worker:$tid dead\n"}
  threads->exit;
}

sub convertUniprotAndNCBI {
  my $long_id = shift;
  $long_id =~ s/\|$//g;
  my @tmp = split(/\|/,$long_id); 
  if(scalar @tmp > 3){
    return pop(@tmp); # take the last column for NCBI format like patterns (e.g. gi|158333234|ref|YP_001514406.1|)
  }elsif(scalar @tmp == 3){
    return $tmp[1]; # uniprot tr|A0A0A0MPE6|A0A0A0MPE6_DANRE -> A0A0A0MPE6
  }else{
    return $long_id; # is neither ncbi nor uniprot 
  } 
}

sub read_details {
  my $file = shift;
  if($debug){print STDERR "read_details($file)\n"}
  my $abs_path_file=abs_path "$file "; $abs_path_file=~s/ $//g;
  my $from_synteny_call = shift;
  my %ids;        # local test for duplicated IDs
  my %genes;
  my $test = 0;
  my $lastgenename="";
  my $cur_gene_is_valid=1;
  my $msg="Checking $file...";

  my $ATCGNoccurences=0; # for checking if faa or fna file -> minimum 50% ATCGN content for fna and maximal 80% for faa files
  my $genelength=0; # for checking if faa or fna file -> minimum genelength for checking is 50

  if (defined($_[0])) {$test = 1;}  # if no ID Hash is give, we do not want to test but to fetch the gff data

  my %isoform_mapping_ncbiuniprot_correction; # the ncbi uniprot isoforms are not correspoding to the correct ID !
  
  my $found_comma_in_file=0;

  my $did_found_emptyline=0;
  if (!-e $file)    {&Error("File '$file' not found!");}
  open(FASTA,"<$file") || &Error("Could not open '$file': $!");
  while (<FASTA>) {
    my $curLine=$_;$curLine=~s/\r//g;
    if ($curLine =~/^ *$/){$did_found_emptyline=1;}
    elsif ($curLine =~ /^[^a-z#>]/i){&Error("\nERROR found line with forbidden symbols in '$file':\n$_\n"); exit 1}
    if ($curLine =~ />/) { #head line -> gene name and description
      if($isoform eq "uniprot"){
        lock(%isoform_mapping);
        if($curLine =~ m/isoform of ([^ ,]+)/i){
          my $iso = $1;
          $curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
          $curLine =~ s/^>//;
          $curLine =~ s/\s.*//;
          $isoform_mapping{$file." ".$curLine} = $iso;

          if($debug){$msg.="found isoform '$file $curLine' => '$iso'\n";}
        }
        $curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
        $curLine =~ s/^>//;
        $curLine =~ s/\s.*//;
        $isoform_mapping_ncbiuniprot_correction{$file." ".&convertUniprotAndNCBI($curLine)}=$file." ".$curLine;

      }elsif($isoform eq "trinity"){
        lock(%isoform_mapping);

        if($curLine =~ m/^([^ ]+)_i[0-9]+([^0-9].*|$)/){
          my $iso = $1;
          $curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
          $curLine =~ s/^>//;
          $curLine =~ s/\s.*//;
          $iso =~ s/^>//;
          $isoform_mapping{$file." ".$curLine}=$iso;

          if($debug){$msg.="found isoform '$file $curLine' => '$iso'\n";}
        }
      }elsif($isoform eq "ncbi"){
        lock(%isoform_mapping);
        if($curLine =~ m/gene:([^ ]+).*isoform/i || $curLine =~ m/isoform.*Acc:MGI:([0-9])/i){
          my $iso = $1;
          $curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
          $curLine =~ s/^>//;
          $curLine =~ s/\s.*//;
          $isoform_mapping{$file." ".$curLine} = $iso;

          if($debug){$msg.="found isoform '$file $curLine' => '$iso'\n";}
        }
        $curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
        $curLine =~ s/^>//;
        $curLine =~ s/\s.*//;
        $isoform_mapping_ncbiuniprot_correction{$file." ".$curLine}=$file." ".$curLine;
      }

      if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) && $cur_gene_is_valid<1){last;}
      if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) && ($genelength>50 && ( ($allowedAlphabet->{$blastmode} eq "n" && $ATCGNoccurences/$genelength < 0.5) || ($allowedAlphabet->{$blastmode} eq "a" && $ATCGNoccurences/$genelength > 0.8)))){$cur_gene_is_valid= -1;last;}

      if( $blastmode eq "autoblast" && 
        !exists $autoblast_fileis{$abs_path_file} && $genelength>50 ){
        if( $genelength>50 && $ATCGNoccurences/$genelength < 0.5 ){
          $autoblast_fileis{$abs_path_file} = "prot";
        }elsif( $ATCGNoccurences/$genelength > 0.8 ){
          $autoblast_fileis{$abs_path_file} = "nucl";
        }
      }
      
      {
        lock(%gene_counter);
        $gene_counter{$file}++;
      }
      $curLine =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed
      $curLine =~ s/^>//;
      $curLine =~ s/\s.*//;
      $lastgenename=$curLine;
      if ($test && $isoform eq "") { # disable for -isoform
        if (defined($ids{$curLine}))  {&Error("Gene ID '$curLine' is defined at least twice in $file, please make sure that the identifier are unique !");}
        $ids{$curLine} = $file;
      }
      if(index($curLine, ",") != -1 && $file=~/_clean/){
        &Error("\n$ORANGE [ERROR]$NC input '$file' contains a gene-name with a comma, this causes problems with the proteinortho.tsv output, I cannot clean the file, is the input gzipped?\nThe line is:\n$curLine\n");
        exit 1;
      }elsif(index($curLine, ",") != -1){
        # 6.0.32 : check if gene name contains a comma -> this will cause problems with the proteinortho.tsv output (gene cluster speparator)
        my $file_clean = $file;
        $file_clean=~s/(\.[^.]+)$/_clean$1/;
        $found_comma_in_file=1;
        $msg.="\n$ORANGE [WARNING]$NC input '$file' contains a gene-name with a comma, this causes problems with the proteinortho.tsv output, I will clean the file ('$file_clean') and restart the analysis !!!\nThe line is:\n$curLine\n";
        last;
      }
      if ($synteny) {$genes{$curLine} = 1;}

      $cur_gene_is_valid=1;
      $ATCGNoccurences=0;
      $genelength=0;

    }else{
      $_ =~ s/[\r\n]+$//;#chomp only removes last \n newline, now also \r are removed and all occurences
      if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode})){ #test if the current gene is valid or not (for blastalgorithms that require either amino or nucleotide sequences only)
        if($allowedAlphabet->{$blastmode} eq "a"){ #"a"= aminoacid sequence
          if( $_ =~ /[^$aminoAlphabet]/){
            $cur_gene_is_valid=0;
          }
        }elsif($_ =~ /[^$nucleotideAlphabet]/){
            $cur_gene_is_valid=0;
        }
      }
      if($blastmode eq "autoblast" || !$force && $checkfasta && exists($allowedAlphabet->{$blastmode})){ 
        $ATCGNoccurences += ($_ =~ tr/AaTtUuGgCcNn//);
        $genelength += length($_);
      }

      my($filename, $dir, $ext) = fileparse($file);
      if($blastmode eq "diamond"){ # test for -sensitive option (gene with length >40)
        lock(%max_gene_length_diamond);
        if(exists $max_gene_length_diamond{$filename}){
          if(length($_)>$max_gene_length_diamond{$filename}){
            $max_gene_length_diamond{$filename}=length($_)+0;
          }
        }else{
          $max_gene_length_diamond{$filename}=length($_)+0;
        }
      }
    }
  }
  close(FASTA);

  if($found_comma_in_file && !$from_synteny_call){
    # 6.0.32 : replace the , with ; -> write output in a *_clean.* file
    my $file_clean = $file; #abs_path $file;
    $file_clean=~s/(\.[^.]+)$/_clean$1/;
    open(FASTA,"<$file") || &Error("Could not open '$file': $!");
    open(FASTA_CLEAN,">$file_clean") || &Error("Could not open '$file_clean': $!");
    while (<FASTA>) {
      $_=~s/[\n\r]+$//g;
      if(/^>/){
        my @arr = split(" ",$_);
        $arr[0]=~s/,/;/g;
        print FASTA_CLEAN join(" ",@arr)."\n";
      }elsif($_ ne ""){
        print FASTA_CLEAN "$_\n";
      }
    }
    close(FASTA);
    close(FASTA_CLEAN);
    if($step==2){
      $msg.=("\n$ORANGE [WARNING]$NC Restarting the indices generation.$NC");
      &generate_indices($file_clean);
    }
    $_ eq $file and $_ = $file_clean for @files;
    $_ eq $file and $_ = $file_clean for @files_cleanup;
    #goto RESTART_check_files;
    $check_files_QUEUE->enqueue($file_clean);
  }

  if($isoform eq "uniprot" || $isoform eq "ncbi"){
    lock(%isoform_mapping);
    foreach my $key (keys %isoform_mapping){
      if(exists $isoform_mapping_ncbiuniprot_correction{$isoform_mapping{$key}}){
        if($debug){$msg.="corrected isoform '$isoform_mapping{$key}' => '$isoform_mapping_ncbiuniprot_correction{$isoform_mapping{$key}}'\n";}
        $isoform_mapping{$key}=$isoform_mapping_ncbiuniprot_correction{$isoform_mapping{$key}};
      }
    }
  }
  if($blastmode eq "autoblast" && 
    !exists $autoblast_fileis{$abs_path_file} && $genelength>50 ){
    if( $genelength>50 && $ATCGNoccurences/$genelength < 0.5 ){
      $autoblast_fileis{$abs_path_file} = "prot";
    }elsif( $ATCGNoccurences/$genelength > 0.8 ){
      $autoblast_fileis{$abs_path_file} = "nucl";
    }
  }

  if($did_found_emptyline){
    $msg.=("$ORANGE [WARNING]$NC Found empty line in $file, removing it with perl -lne.$NC");
    system('perl -lne \'if($_ !~/^ *$/){print "$_";}\' '.$file.' >'.$file.'.tmp');
    system('mv '.$file.'.tmp '.$file);
    if($step==2){
      $msg.=("$ORANGE [WARNING]$NC Restarting the indices generation.$NC");
      &generate_indices($file);
    }
  }

  if(!$force && $checkfasta && $cur_gene_is_valid==1 && exists($allowedAlphabet->{$blastmode}) && ($genelength>50 && ( ($allowedAlphabet->{$blastmode} eq "n" && $ATCGNoccurences/$genelength < 0.5) || ($allowedAlphabet->{$blastmode} eq "a" && $ATCGNoccurences/$genelength > 0.8)))){$cur_gene_is_valid= -1;}
  if(!$force && $checkfasta && exists($allowedAlphabet->{$blastmode}) ){

    if($allowedAlphabet->{$blastmode} eq "n" && $cur_gene_is_valid<1 ){

      if($cur_gene_is_valid==-1){
       $msg.=("\n$ORANGE [WARNING]$NC The occurences of nucleotide characters (ATCGN) is less than 50% of in input fasta file '".$file."' in entry '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
      }else{
        $msg.=("\n$ORANGE [WARNING]$NC Found forbidden non-nucleotide character in input fasta file '".$file."' in entry '".$lastgenename."'. $blastmode expects nucleotide characters...$NC");
      }
      if( exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
        $blastmode = $blastmode_pendant->{$blastmode};
        $msg.=("\n!!!\n[WARNING]$NC Switching now to $blastmode and restarting...\n");
        $msg.="\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
        sleep 10;
        $msg.="\nWell then, proceeding...\n\n";
        goto RESTART;
      }

      &Error("\nThe algorithm (-p=$blastmode) does not support the given input files (use --force to skip this behaviour)...");

    }elsif($allowedAlphabet->{$blastmode} eq "a" && $cur_gene_is_valid<1 ){

      if($cur_gene_is_valid==-1){
        $msg.=("\n$ORANGE [WARNING]$NC The occurences of nucleotide characters (ATCGN) is greater than 80% of '".$file."' in entry '".$lastgenename."'. $blastmode expects aminoacid characters...$NC");
      }else{
        $msg.=("\$ORANGE [WARNING]$NC Found forbidden non-aminoacid character in input fasta file '$file' in entry '$lastgenename'. $blastmode expects aminoacid characters$NC");
      }

      if(exists($blastmode_pendant->{$blastmode}) && $restart_counter==0 && $step <2){ # only for step = 0 and step 1 you can do a rerun else the DB are missing
        $blastmode = $blastmode_pendant->{$blastmode};
        $msg.=("\n!!!\n[WARNING]$NC Switching now to $blastmode and restarting...\n");
      $msg.="\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n!!!\n";
       sleep 10;
      $msg.="\nWell then, proceeding...\n\n";
        goto RESTART;
      }

      &Error("\nThe algorithm (-p=$blastmode) does not support the given input files (use --force to skip this behaviour)...");
    }
  }

  if ($blastmode eq "autoblast") {$msg .= " (".$autoblast_fileis{$abs_path_file}.") ";}
  $msg.=" ok\n";

  if($verbose){print STDERR $msg}
  if($debug){print STDERR "read_details($file):done\n"}
  #$log_QUEUE->enqueue($msg);
  unless ($synteny) {return;}

  my %coordinates;
  if ($verbose && $test) {$msg.="$file\t".scalar(keys %genes)." genes\t";}
  my $gff = &gff4fasta($file);
  open(GFF,"<$gff") || &Error("Could not open '$gff': $!");
  while (<GFF>) {
    if ($_ =~ /^##FASTA/) {last;} # deal with prokka gffs, thx 2 Ben Woodcroft
    if ($_ =~ /^#/) {next;}
    $_=~s/[\r\n]+$//g;
    # e.g. NC_009925.1  RefSeq  CDS 9275  10096 . - 0 ID=cds8;Name=YP_001514414.1;Parent=gene9;Dbxref=Genbank:YP_001514414.1,GeneID:5678848;gbkey=CDS;product=signal peptide peptidase SppA;protein_id=YP_001514414.1;transl_table=11
    my @col = split(/\t+/,$_);
    if(scalar @col <9){next}
    if ($col[2] ne "CDS") {next;}
    if ($col[8] =~ /(?:Name|protein_id)=([^;]+)($|;)/i) {
      my $gene_nam=$1;
      $gene_nam =~ s/,/;/g; # 6.0.32 ,->;
      if(defined($genes{$gene_nam})){
        delete $genes{$gene_nam};
        if (!$test && $col[6] eq "+") {$coordinates{$gene_nam} = "$col[0]\t$col[6]\t$col[3]";} # store
        if (!$test && $col[6] eq "-") {$coordinates{$gene_nam} = "$col[0]\t$col[6]\t$col[4]";} # store
        next;
      }
    }elsif ($col[8] =~ /ID=([^;]+)($|;)/i) {
      my $gene_nam=$1;
      $gene_nam =~ s/,/;/g; # 6.0.32 ,->;
      if(defined($genes{$gene_nam})){
        delete $genes{$gene_nam};
        if (!$test && $col[6] eq "+") {$coordinates{$gene_nam} = "$col[0]\t$col[6]\t$col[3]";} # store
        if (!$test && $col[6] eq "-") {$coordinates{$gene_nam} = "$col[0]\t$col[6]\t$col[4]";} # store
      }
    }
  }
  close(GFF);

  if (scalar(keys %genes)) {
    my @tmp = keys %genes;
    my $joined=join(",",@tmp);
    if(length $joined > 100){$joined=substr($joined,0,100)."..."}
    &Error("No coordinate found for ".(scalar @tmp)." gene(s): '$joined'\nusing '$gff' and '$file'\n\nPlease make sure that the gff file corresponds to the input fasta files. Each protein with the ID 'XXX' needs to be refered in the gff file with the attribute 'Name=XXX'\nFor more information see 'https://gitlab.com/paulklemm_PHD/proteinortho#poff'");
  }

  if (!$test) {return \%coordinates;}   # store
}

sub TypicalBlastError {
  my $blastmode=shift;
  my $command=shift;
  $command=~s/2>\/dev\/null//g;
  my $retcode=shift;
  my $retmsg=shift;
  &Error("$NC".$blastmode." failed\n${ORANGE}return error code$NC = $retcode\n${ORANGE}full error command$NC = $command\n${ORANGE}return error message$NC = $retmsg")
  # The most common sources of this error are:\n- no space left on device error.\n- outdated $blastmode, please update $blastmode or consider another -p algorithm.\n- the databases are missing. Maybe you ran --step=1 and moved the databases afterwards? Please rerun 'proteinortho --step=1 --force /path/to/fastas'\n- maybe the fasta files contain mixed nucleotide and aminoacid sequences or are not suited for $blastmode? (For example diamond only processes protein sequences) Try 'proteinortho --step=1 --check --force /path/to/fastas'.
}
sub Error {
  $debug=1;
  if($_[0] ne "I need at least two files to compare something!"){print STDERR "\n";print STDERR &get_parameter;}

  print STDERR "\n\n$RED"."[Error]$NC $ORANGE ".$_[0]." $NC \n\n";

  if($_[0] ne "I need at least two files to compare something!"){print STDERR "Please visit the proteinortho-wiki, where the most common errors are documented:\nhttps://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes\n\nIf you cannot solve this error, please file a report (including the input files, the error code and the above 'Parameter-vector'):\nincoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com\n\nFurther more all mails to lechner\@staff.uni-marburg.de are welcome.\n\n\n";}

  &reset_locale();
  if (!$keep && $tmp_path =~ m/\/proteinortho_cache_[^\/]+\d*\/$/ && $step!=1 ){system("rm -r $tmp_path >/dev/null 2>&1");}
  exit 1;
}

# Remove .fasta/.faa etc. and change it to .gff
## Update 6.01: Test for additional naming schemes
sub gff4fasta {
  my $ori_gff = shift;
  my $gff="";

  $gff=$ori_gff;
  $gff =~ s/\.[^.]+$/.gff/;
  if (-e $gff) {return $gff;}
  if (-e $gff."3") {return $gff."3";}

  $gff=$ori_gff;
  $gff =~ s/\.[^.]+$/.gtf/;
  if (-e $gff) {return $gff;}
  if (-e $gff."3") {return $gff."3";}

  $gff=$ori_gff;
  $gff =~ s/_clean\.[^.]+$/.gff/;
  if (-e $gff) {return $gff;}
  if (-e $gff."3") {return $gff."3";}

  $gff=$ori_gff;
  $gff =~ s/_clean\.[^.]+$/.gtf/;
  if (-e $gff) {return $gff;}
  if (-e $gff."3") {return $gff."3";}

  $gff=$ori_gff;
  my $ncbi_gff = $gff;
  if ($ncbi_gff =~ s/_cds_from//) {
    if (-e $ncbi_gff) {return $ncbi_gff;}
  }

  $gff=$ori_gff;
  $ncbi_gff = $gff;
  if ($ncbi_gff =~ s/_protein/_genomic/) {
    if (-e $ncbi_gff) {return $ncbi_gff;}
  }

  return $gff;
}

sub get_po_path {
  my @fileparse0 = fileparse($0); # path to the C++-part of this program

  my $uname=`uname -s`;
  $uname=~s/[\r\n]+$//;
  $uname.="_".`uname -m`;
  $uname=~s/[\r\n]+$//;

  if(!-x $fileparse0[1]."/proteinortho_clustering"){
    if(-x $fileparse0[1]."/src/BUILD/$uname/proteinortho_clustering"){
      $fileparse0[1]=$fileparse0[1]."/src/BUILD/$uname";
      if($debug){print STDERR "Detected ".$fileparse0[1]."\n";}
    }elsif(-x "/usr/bin/proteinortho_clustering"){
      $fileparse0[1]="/usr/bin/";
      if($debug){print STDERR "Detected ".$fileparse0[1]."\n";}
    }elsif(-x "/usr/local/bin/proteinortho_clustering"){
      $fileparse0[1]="/usr/local/bin/";
      if($debug){print STDERR "Detected ".$fileparse0[1]."\n";}
    }elsif(-x "$binpath/proteinortho_clustering"){
      $fileparse0[1]="$binpath/";
      if($debug){print STDERR "Detected ".$fileparse0[1]."\n";}
    }else{
      my $p=`which proteinortho_clustering`;
      $p=~s/^([^ ]+)\/proteinortho_clustering.*$/$1/;
      chomp($p);
      $fileparse0[1]=$p;
      if($debug){print STDERR "Detected (PATH enviroment variable)\n";}
    }
  }

  if(!-x $fileparse0[1]."/proteinortho_clustering"){
    &Error("cannot find 'proteinortho_clustering'$NC in: the current directory '.', ./src/, ./src/BUILD/$uname , /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  # if(!-x "$fileparse0[1]/proteinortho_cleanupblastgraph"){
  #   &Error("cannot find proteinortho_cleanupblastgraph in $fileparse0[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
  #   exit 0;
  # }
  # if(!-x "$fileparse0[1]/po_tree"){
  #   &Error("cannot find po_tree in $fileparse0[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
  #   exit 0;
  # }
  if(!-x $fileparse0[1]."/proteinortho2html.pl"){
   &Error("cannot find 'proteinortho2html.pl'$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  # if(!-x "$fileparse0[1]/proteinortho2tree.pl"){
  #   &Error("cannot find proteinortho2tree.pl in $fileparse0[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
  #   exit 0;
  # }
  if(!-x $fileparse0[1]."/proteinortho_ffadj_mcs.py" && $synteny){
    &Error("cannot find 'proteinortho_ffadj_mcs.py'$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  if(!-x $fileparse0[1]."/proteinortho_singletons.pl"){
    &Error("cannot find 'proteinortho_singletons.pl'$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  if(!-x $fileparse0[1]."/proteinortho_graphMinusRemovegraph"){
    &Error("cannot find 'proteinortho_graphMinusRemovegraph'$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  if(!-x $fileparse0[1]."/proteinortho_do_mcl.pl"){
    &Error("cannot find 'proteinortho_do_mcl.pl'$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
      exit 1;
  }
  if(!-x $fileparse0[1]."/proteinortho2xml.pl"){
    &Error("cannot find 'proteinortho2xml.pl'$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, \$PATH, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
    exit 1;
  }
  return $fileparse0[1];
}

sub edgeweight {
  # 1e-10 = 0.15, 1e-20 = 0.3, 1e-40 = 0.6, 1e-66+ = 1.0
  if ($_[0] == 0) {return 1;}
  my $x = -1*&log10($_[0])/100*1.5;
  if ($x > 1) {return 1;}
  if ($x <= 0) {return 0.0001;}
  return $x;
}

# sub log10 {
#   return log($_[0])/log(10);
# }

sub write_descriptions {
  if($verbose){print STDERR "Writing sequence descriptions\n";}
  open DESC, '>', $desctable;
  foreach my $file (@files) {
    if ($verbose) {print STDERR "Extracting descriptions from '$file'\t(".$gene_counter{$file}." entries)\n";}
    open FASTA, '<', $file;
    while (<FASTA>) {
      $_=~s/[\r\n]+$//;
      if (m/^>(\S+)(\s+(.*))?$/) {
        print DESC $1, "\t", ($3 || "unannotated sequence"), "\n";
      }
    }
  }
  if($verbose){print STDERR "[OUTPUT] -> written to $desctable\n";}

}

&reset_locale();
