#include <array>
#include <cstring>
#include <iostream>
#include <chrono>
#include <iomanip>
#include <fstream>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <omp.h>
#include <immintrin.h>
#include <mpi.h>
#include <sstream>
#include "EDNAFULL.h"
#include "EBLOSUM62.h"
#include <unordered_map>
#include <filesystem>
#include <climits>
Data Structures | |
| struct | AffineDPScores |
| struct | Loc |
Macros | |
| #define | RESET "\033[0m" |
| ANSI escape code: reset color. | |
| #define | GREEN "\033[32m" |
| ANSI escape code: green. | |
| #define | RED "\033[31m" |
| ANSI escape code: red. | |
| #define | CYAN "\033[36m" |
| ANSI escape code: cyan. | |
Typedefs | |
| using | ScoreFn = int(*)(char, char) |
Enumerations | |
| enum | ScoreMode { MODE_DNA , MODE_PROTEIN } |
Functions | |
| int | edna_score (char x, char y) |
| Look up the score between two characters based on the selected mode. | |
| int | blosum62_score (char x, char y) |
| Look up the score between two characters based on the selected mode. | |
| int | score (char x, char y, ScoreMode mode) |
| Look up the score between two characters based on the selected mode. | |
| void | showProgressBar (int progress, int total) |
| Show a progress bar in the console. | |
| std::string | getAccession (const std::string &header, ScoreMode mode) |
| Extract the accession number from a FASTA header line. | |
| std::string | getGeneSymbol (const std::string &header, ScoreMode mode) |
| Extract the gene symbol from a FASTA header. | |
| void | processFasta (const string &filename, string &header, string &sequence) |
| Read the first record from a FASTA file. | |
| void | savePlainAlignment (const std::string &header1, const std::string &header2, const std::string &aligned1, const std::string &aligned2, std::ostream &os) |
| Write two aligned sequences in FASTA format. | |
| void | saveLCS (const std::string &id, const std::string &lcs_str, std::ostream &os) |
| Write the LCS (Longest Common Subsequence) in FASTA format. | |
| void | printColoredAlignment (const std::string &seq1_aln, const std::string &seq2_aln, std::ostream &os=std::cout) |
| Print two aligned sequences with color coding. | |
| void | writeRawDPMatrix (const std::vector< std::vector< int > > &dp, const std::string &filename) |
| Write the DP matrix to a file. | |
| void | writeDPMatrix (const std::vector< std::vector< int > > &dp, const std::string &filename) |
| Write the DP matrix to a binary file. | |
| void | writeRawCharMatrix (const std::vector< std::vector< char > > &mat, const std::string &filename) |
| Write a character matrix to a file. | |
| void | writeCharMatrix (const std::vector< std::vector< char > > &mat, const std::string &filename) |
| Write a character matrix to a binary file. | |
| void | initAffineDP (int n, vector< int > &prev_row, vector< int > &prev_gapX, vector< int > &prev_gapY, bool isGlobal) |
| Initialize the DP structures for affine gap scoring. | |
| void | computeAffineDPRow (int i, const string &x, const string &y, vector< int > &prev_row, vector< int > &prev_gapX, vector< int > &prev_gapY, vector< int > &curr_row, vector< int > &curr_gapX, vector< int > &curr_gapY, vector< char > &curr_trace_row, ScoreFn score_fn) |
| Compute a single row of the affine DP matrix. | |
| void | globalalign (const string &x, const string &y, const string &header1, const string &header2, const std::string &outdir, ScoreMode mode, ScoreFn score_fn) |
| Perform global sequence alignment using the Needleman-Wunsch algorithm. | |
| AffineDPScores | compute_local_affine_cell (int s_diag_prev, int e_diag_prev, int f_diag_prev, int s_left, int e_left, int s_up, int f_up, char char_x, char char_y, ScoreFn score_fn) |
| Compute a single cell in the local affine DP matrix. | |
| void | localalign (const std::string &x, const std::string &y, const std::string &header1, const std::string &header2, const std::string &outdir, ScoreMode mode, ScoreFn score_fn) |
| Perform local sequence alignment using the Smith-Waterman algorithm. | |
| void | lcs (const string &x, const string &y, const string &header1, const string &header2, const std::string &outdir, ScoreMode mode) |
| Compute the Longest Common Subsequence (LCS) of two sequences. | |
| int | main (int argc, char **argv) |
| Main function to run the MPI-based sequence alignment tool. | |
Variables | |
| bool | verbose = false |
| bool | binary = false |
| bool | txt = false |
| double | GAP_OPEN = -5.0 |
| Gap opening penalty. | |
| double | GAP_EXTEND = -1.0 |
| #define CYAN "\033[36m" |
ANSI escape code: cyan.
| #define GREEN "\033[32m" |
ANSI escape code: green.
| #define RED "\033[31m" |
ANSI escape code: red.
| #define RESET "\033[0m" |
ANSI escape code: reset color.
| using ScoreFn = int(*)(char,char) |
| enum ScoreMode |
| int blosum62_score | ( | char | x, |
| char | y | ||
| ) |
inline
Look up the score between two characters based on the selected mode.
| x | First character (base or amino acid). |
| y | Second character (base or amino acid). |
| AffineDPScores compute_local_affine_cell | ( | int | s_diag_prev, |
| int | e_diag_prev, | ||
| int | f_diag_prev, | ||
| int | s_left, | ||
| int | e_left, | ||
| int | s_up, | ||
| int | f_up, | ||
| char | char_x, | ||
| char | char_y, | ||
| ScoreFn | score_fn | ||
| ) |
Compute a single cell in the local affine DP matrix.
This function computes the scores for a single cell in the local alignment DP matrix using affine gap penalties. It returns an AffineDPScores struct containing the scores and the pointer for traceback.
| s_diag_prev | S value from the diagonal cell (i-1, j-1) |
| e_diag_prev | E value from the diagonal cell (i-1, j-1) |
| f_diag_prev | F value from the diagonal cell (i-1, j-1) |
| s_left | S value from the left cell (i, j-1) |
| e_left | E value from the left cell (i, j-1) |
| s_up | S value from the upper cell (i-1, j) |
| f_up | F value from the upper cell (i-1, j) |
| char_x | Character from sequence X at position i |
| char_y | Character from sequence Y at position j |
| score_fn | Scoring function for match/mismatch |
| void computeAffineDPRow | ( | int | i, |
| const string & | x, | ||
| const string & | y, | ||
| vector< int > & | prev_row, | ||
| vector< int > & | prev_gapX, | ||
| vector< int > & | prev_gapY, | ||
| vector< int > & | curr_row, | ||
| vector< int > & | curr_gapX, | ||
| vector< int > & | curr_gapY, | ||
| vector< char > & | curr_trace_row, | ||
| ScoreFn | score_fn | ||
| ) |
Compute a single row of the affine DP matrix.
This function computes the i-th row of the DP matrix for affine gap scoring, including match/mismatch scores and gap scores in both directions.
| i | The current row index (1-based). |
| x | The first sequence (string). |
| y | The second sequence (string). |
| prev_row | Previous row S_{i-1}. |
| prev_gapX | Previous gap row F_{i-1} (gap in Y). |
| prev_gapY | Previous gap row E_{i-1} (gap in X). |
| curr_row | Current row S_i to be filled. |
| curr_gapX | Current gap row F_i to be filled. |
| curr_gapY | Current gap row E_i to be filled. |
| curr_trace_row | Pointer/Trace for S_i to be filled. |
| score_fn | Scoring function for match/mismatch. |
| int edna_score | ( | char | x, |
| char | y | ||
| ) |
inline
Look up the score between two characters based on the selected mode.
| x | First character (base or amino acid). |
| y | Second character (base or amino acid). |
| std::string getAccession | ( | const std::string & | header, |
| ScoreMode | mode | ||
| ) |
Extract the accession number from a FASTA header line.
The accession number is the first word in the header line, which is expected to start with a >.
| header | The header line from a FASTA file. |
| std::string getGeneSymbol | ( | const std::string & | header, |
| ScoreMode | mode | ||
| ) |
Extract the gene symbol from a FASTA header.
For DNA headers, looks for the first pair of parentheses “(GENE)” and returns GENE. For protein headers, takes the part after the second ‘|’ up to the first underscore.
| header | The FASTA header line (without the leading '>'). |
| mode | MODE_DNA or MODE_PROTEIN. |
| void globalalign | ( | const string & | x, |
| const string & | y, | ||
| const string & | header1, | ||
| const string & | header2, | ||
| const std::string & | outdir, | ||
| ScoreMode | mode, | ||
| ScoreFn | score_fn | ||
| ) |
Perform global sequence alignment using the Needleman-Wunsch algorithm.
This function computes the global alignment of two sequences using affine gap scoring. It initializes the DP matrix, fills it row by row, and performs traceback to get the aligned sequences.
| x | The first sequence (string). |
| y | The second sequence (string). |
| header1 | Header for the first sequence. |
| header2 | Header for the second sequence. |
| outdir | Output directory for results. |
| mode | Scoring mode (MODE_DNA or MODE_PROTEIN). |
| score_fn | Scoring function for match/mismatch. |
| void initAffineDP | ( | int | n, |
| vector< int > & | prev_row, | ||
| vector< int > & | prev_gapX, | ||
| vector< int > & | prev_gapY, | ||
| bool | isGlobal | ||
| ) |
Initialize the DP structures for affine gap scoring.
| n | Length of the second sequence (Y). |
| prev_row | Row i-1, match/mismatch scores. |
| prev_gapX | Row i-1, gap scores in X. |
| prev_gapY | Row i-1, gap scores in Y. |
| isGlobal | Whether this is a global alignment (affects initialization). |
| void lcs | ( | const string & | x, |
| const string & | y, | ||
| const string & | header1, | ||
| const string & | header2, | ||
| const std::string & | outdir, | ||
| ScoreMode | mode | ||
| ) |
Compute the Longest Common Subsequence (LCS) of two sequences.
This function computes the LCS using dynamic programming and saves the results.
| x | The first sequence (string). |
| y | The second sequence (string). |
| header1 | Header for the first sequence. |
| header2 | Header for the second sequence. |
| outdir | Output directory for results. |
| mode | Scoring mode (MODE_DNA or MODE_PROTEIN). |
| void localalign | ( | const std::string & | x, |
| const std::string & | y, | ||
| const std::string & | header1, | ||
| const std::string & | header2, | ||
| const std::string & | outdir, | ||
| ScoreMode | mode, | ||
| ScoreFn | score_fn | ||
| ) |
Perform local sequence alignment using the Smith-Waterman algorithm.
This function computes the local alignment of two sequences using affine gap scoring. It initializes the DP matrix, fills it row by row, and performs traceback to get the aligned sequences.
| x | The first sequence (string). |
| y | The second sequence (string). |
| header1 | Header for the first sequence. |
| header2 | Header for the second sequence. |
| outdir | Output directory for results. |
| mode | Scoring mode (MODE_DNA or MODE_PROTEIN). |
| score_fn | Scoring function for match/mismatch. |
| int main | ( | int | argc, |
| char ** | argv | ||
| ) |
Main function to run the MPI-based sequence alignment tool.
This function initializes MPI, parses command line arguments, reads input files, and performs sequence alignments based on user choices.
| argc | Number of command line arguments. |
| argv | Array of command line arguments. |
| void printColoredAlignment | ( | const std::string & | seq1_aln, |
| const std::string & | seq2_aln, | ||
| std::ostream & | os = std::cout | ||
| ) |
Print two aligned sequences with color coding.
Matches are printed in green, mismatches in cyan, and gaps in red. The alignment is printed in blocks of LINE_WIDTH characters.
| seq1_aln | Aligned sequence 1 (with gaps). |
| seq2_aln | Aligned sequence 2 (with gaps). |
| os | Output stream (default is std::cout). |
| void processFasta | ( | const string & | filename, |
| string & | header, | ||
| string & | sequence | ||
| ) |
Read the first record from a FASTA file.
Extracts the very first header (minus the leading >), and concatenates all subsequent lines into a single sequence string.
| filename | Path to the FASTA file. | |
| [out] | header | On return, the header line without leading >. |
| [out] | sequence | On return, the full sequence (no newlines). |
| std::runtime_error | if the file cannot be opened. |
| void saveLCS | ( | const std::string & | id, |
| const std::string & | lcs_str, | ||
| std::ostream & | os | ||
| ) |
Write the LCS (Longest Common Subsequence) in FASTA format.
| id | Identifier for the LCS (no leading '>'). |
| lcs_str | The LCS string. |
| os | Output stream (e.g. ofstream). |
| void savePlainAlignment | ( | const std::string & | header1, |
| const std::string & | header2, | ||
| const std::string & | aligned1, | ||
| const std::string & | aligned2, | ||
| std::ostream & | os | ||
| ) |
Write two aligned sequences in FASTA format.
| header1 | Identifier for sequence 1 (no leading '>'). |
| header2 | Identifier for sequence 2 (no leading '>'). |
| aligned1 | Aligned sequence 1 (with gaps). |
| aligned2 | Aligned sequence 2 (with gaps). |
| os | Output stream (e.g. ofstream). |
| int score | ( | char | x, |
| char | y, | ||
| ScoreMode | mode | ||
| ) |
inline
Look up the score between two characters based on the selected mode.
| x | First character (base or amino acid). |
| y | Second character (base or amino acid). |
| mode | Scoring mode (MODE_DNA or MODE_PROTEIN). |
| void showProgressBar | ( | int | progress, |
| int | total | ||
| ) |
Show a progress bar in the console.
| progress | Current progress (0 to total). |
| total | Total number of steps. |
| void writeCharMatrix | ( | const std::vector< std::vector< char > > & | mat, |
| const std::string & | filename | ||
| ) |
Write a character matrix to a binary file.
| mat | The character matrix (2D vector). |
| filename | The output filename. |
| void writeDPMatrix | ( | const std::vector< std::vector< int > > & | dp, |
| const std::string & | filename | ||
| ) |
Write the DP matrix to a binary file.
| dp | The DP matrix (2D vector). |
| filename | The output filename. |
| void writeRawCharMatrix | ( | const std::vector< std::vector< char > > & | mat, |
| const std::string & | filename | ||
| ) |
Write a character matrix to a file.
| mat | The character matrix (2D vector). |
| filename | The output filename. |
| void writeRawDPMatrix | ( | const std::vector< std::vector< int > > & | dp, |
| const std::string & | filename | ||
| ) |
Write the DP matrix to a file.
| dp | The DP matrix (2D vector). |
| filename | The output filename. |
| bool binary = false |
| double GAP_EXTEND = -1.0 |
| double GAP_OPEN = -5.0 |
Gap opening penalty.
| bool txt = false |
| bool verbose = false |