SequenceAligner
 
Loading...
Searching...
No Matches
main.cpp File Reference
#include <array>
#include <cstring>
#include <iostream>
#include <chrono>
#include <iomanip>
#include <fstream>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <omp.h>
#include <immintrin.h>
#include <mpi.h>
#include <sstream>
#include "EDNAFULL.h"
#include "EBLOSUM62.h"
#include <unordered_map>
#include <filesystem>
#include <climits>

Data Structures

struct  AffineDPScores
 
struct  Loc
 

Macros

#define RESET   "\033[0m"
 ANSI escape code: reset color.
 
#define GREEN   "\033[32m"
 ANSI escape code: green.
 
#define RED   "\033[31m"
 ANSI escape code: red.
 
#define CYAN   "\033[36m"
 ANSI escape code: cyan.
 

Typedefs

using ScoreFn = int(*)(char, char)
 

Enumerations

enum  ScoreMode { MODE_DNA , MODE_PROTEIN }
 

Functions

int edna_score (char x, char y)
 Look up the score between two characters in the EDNAFULL scoring matrix.
 
int blosum62_score (char x, char y)
 Look up the score between two characters in the BLOSUM62 scoring matrix.
 
int score (char x, char y, ScoreMode mode)
 Look up the score between two characters based on the selected mode.
 
void showProgressBar (int progress, int total)
 Show a progress bar in the console.
 
std::string getAccession (const std::string &header, ScoreMode mode)
 Extract the accession number from a FASTA header line.
 
std::string getGeneSymbol (const std::string &header, ScoreMode mode)
 Extract the gene symbol from a FASTA header.
 
void processFasta (const string &filename, string &header, string &sequence)
 Read the first record from a FASTA file.
 
void savePlainAlignment (const std::string &header1, const std::string &header2, const std::string &aligned1, const std::string &aligned2, std::ostream &os)
 Write two aligned sequences in FASTA format.
 
void saveLCS (const std::string &id, const std::string &lcs_str, std::ostream &os)
 Write the LCS (Longest Common Subsequence) in FASTA format.
 
void printColoredAlignment (const std::string &seq1_aln, const std::string &seq2_aln, std::ostream &os=std::cout)
 Print two aligned sequences with color coding.
 
void writeRawDPMatrix (const std::vector< std::vector< int > > &dp, const std::string &filename)
 Write the DP matrix to a file.
 
void writeDPMatrix (const std::vector< std::vector< int > > &dp, const std::string &filename)
 Write the DP matrix to a binary file.
 
void writeRawCharMatrix (const std::vector< std::vector< char > > &mat, const std::string &filename)
 Write a character matrix to a file.
 
void writeCharMatrix (const std::vector< std::vector< char > > &mat, const std::string &filename)
 Write a character matrix to a binary file.
 
void initAffineDP (int n, vector< int > &prev_row, vector< int > &prev_gapX, vector< int > &prev_gapY, bool isGlobal)
 Initialize the DP structures for affine gap scoring.
 
void computeAffineDPRow (int i, const string &x, const string &y, vector< int > &prev_row, vector< int > &prev_gapX, vector< int > &prev_gapY, vector< int > &curr_row, vector< int > &curr_gapX, vector< int > &curr_gapY, vector< char > &curr_trace_row, ScoreFn score_fn)
 Compute a single row of the affine DP matrix.
 
void globalalign (const string &x, const string &y, const string &header1, const string &header2, const std::string &outdir, ScoreMode mode, ScoreFn score_fn)
 Perform global sequence alignment using the Needleman-Wunsch algorithm.
 
AffineDPScores compute_local_affine_cell (int s_diag_prev, int e_diag_prev, int f_diag_prev, int s_left, int e_left, int s_up, int f_up, char char_x, char char_y, ScoreFn score_fn)
 Compute a single cell in the local affine DP matrix.
 
void localalign (const std::string &x, const std::string &y, const std::string &header1, const std::string &header2, const std::string &outdir, ScoreMode mode, ScoreFn score_fn)
 Perform local sequence alignment using the Smith-Waterman algorithm.
 
void lcs (const string &x, const string &y, const string &header1, const string &header2, const std::string &outdir, ScoreMode mode)
 Compute the Longest Common Subsequence (LCS) of two sequences.
 
int main (int argc, char **argv)
 Main function to run the MPI-based sequence alignment tool.
 

Variables

bool verbose = false
 
bool binary = false
 
bool txt = false
 
double GAP_OPEN = -5.0
 Gap-opening penalty.
 
double GAP_EXTEND = -1.0
 

Macro Definition Documentation

◆ CYAN

#define CYAN   "\033[36m"

ANSI escape code: cyan.

◆ GREEN

#define GREEN   "\033[32m"

ANSI escape code: green.

◆ RED

#define RED   "\033[31m"

ANSI escape code: red.

◆ RESET

#define RESET   "\033[0m"

ANSI escape code: reset color.

Typedef Documentation

◆ ScoreFn

using ScoreFn = int(*)(char,char)

Enumeration Type Documentation

◆ ScoreMode

enum ScoreMode
Enumerator
MODE_DNA 
MODE_PROTEIN 

Function Documentation

◆ blosum62_score()

int blosum62_score ( char  x,
char  y 
)
inline

Look up the score between two characters in the BLOSUM62 scoring matrix.

Parameters
x: First character (base or amino acid).
y: Second character (base or amino acid).
Returns
Score based on the selected scoring matrix.

◆ compute_local_affine_cell()

AffineDPScores compute_local_affine_cell ( int  s_diag_prev,
int  e_diag_prev,
int  f_diag_prev,
int  s_left,
int  e_left,
int  s_up,
int  f_up,
char  char_x,
char  char_y,
ScoreFn  score_fn 
)

Compute a single cell in the local affine DP matrix.

This function computes the scores for a single cell in the local alignment DP matrix using affine gap penalties. It returns an AffineDPScores struct containing the scores and the pointer for traceback.

Parameters
s_diag_prev: S value from the diagonal cell (i-1, j-1)
e_diag_prev: E value from the diagonal cell (i-1, j-1)
f_diag_prev: F value from the diagonal cell (i-1, j-1)
s_left: S value from the left cell (i, j-1)
e_left: E value from the left cell (i, j-1)
s_up: S value from the upper cell (i-1, j)
f_up: F value from the upper cell (i-1, j)
char_x: Character from sequence X at position i
char_y: Character from sequence Y at position j
score_fn: Scoring function for match/mismatch
Returns
AffineDPScores containing computed scores and pointer

◆ computeAffineDPRow()

void computeAffineDPRow ( int  i,
const string &  x,
const string &  y,
vector< int > &  prev_row,
vector< int > &  prev_gapX,
vector< int > &  prev_gapY,
vector< int > &  curr_row,
vector< int > &  curr_gapX,
vector< int > &  curr_gapY,
vector< char > &  curr_trace_row,
ScoreFn  score_fn 
)

Compute a single row of the affine DP matrix.

This function computes the i-th row of the DP matrix for affine gap scoring, including match/mismatch scores and gap scores in both directions.

Parameters
i: The current row index (1-based).
x: The first sequence (string).
y: The second sequence (string).
prev_row: Previous row S_{i-1}.
prev_gapX: Previous gap row F_{i-1} (gap in Y).
prev_gapY: Previous gap row E_{i-1} (gap in X).
curr_row: Current row S_i to be filled.
curr_gapX: Current gap row F_i to be filled.
curr_gapY: Current gap row E_i to be filled.
curr_trace_row: Pointer/Trace for S_i to be filled.
score_fn: Scoring function for match/mismatch.

◆ edna_score()

int edna_score ( char  x,
char  y 
)
inline

Look up the score between two characters in the EDNAFULL scoring matrix.

Parameters
x: First character (base or amino acid).
y: Second character (base or amino acid).
Returns
Score based on the selected scoring matrix.

◆ getAccession()

std::string getAccession ( const std::string &  header,
ScoreMode  mode 
)

Extract the accession number from a FASTA header line.

The accession number is the first word in the header line, which is expected to start with a >.

Parameters
header: The header line from a FASTA file.
mode: MODE_DNA or MODE_PROTEIN.
Returns
The accession number (first word).

◆ getGeneSymbol()

std::string getGeneSymbol ( const std::string &  header,
ScoreMode  mode 
)

Extract the gene symbol from a FASTA header.

For DNA headers, looks for the first pair of parentheses “(GENE)” and returns GENE. For protein headers, takes the part after the second ‘|’ up to the first underscore.

Parameters
header: The FASTA header line (without the leading '>').
mode: MODE_DNA or MODE_PROTEIN.
Returns
The gene symbol, or an empty string on failure.

◆ globalalign()

void globalalign ( const string &  x,
const string &  y,
const string &  header1,
const string &  header2,
const std::string &  outdir,
ScoreMode  mode,
ScoreFn  score_fn 
)

Perform global sequence alignment using the Needleman-Wunsch algorithm.

This function computes the global alignment of two sequences using affine gap scoring. It initializes the DP matrix, fills it row by row, and performs traceback to get the aligned sequences.

Parameters
x: The first sequence (string).
y: The second sequence (string).
header1: Header for the first sequence.
header2: Header for the second sequence.
outdir: Output directory for results.
mode: Scoring mode (MODE_DNA or MODE_PROTEIN).
score_fn: Scoring function for match/mismatch.

◆ initAffineDP()

void initAffineDP ( int  n,
vector< int > &  prev_row,
vector< int > &  prev_gapX,
vector< int > &  prev_gapY,
bool  isGlobal 
)

Initialize the DP structures for affine gap scoring.

Parameters
n: Length of the second sequence (Y).
prev_row: Row i-1, match/mismatch scores.
prev_gapX: Row i-1, gap scores in X.
prev_gapY: Row i-1, gap scores in Y.
isGlobal: Whether this is a global alignment (affects initialization).

◆ lcs()

void lcs ( const string &  x,
const string &  y,
const string &  header1,
const string &  header2,
const std::string &  outdir,
ScoreMode  mode 
)

Compute the Longest Common Subsequence (LCS) of two sequences.

This function computes the LCS using dynamic programming and saves the results.

Parameters
x: The first sequence (string).
y: The second sequence (string).
header1: Header for the first sequence.
header2: Header for the second sequence.
outdir: Output directory for results.
mode: Scoring mode (MODE_DNA or MODE_PROTEIN).

◆ localalign()

void localalign ( const std::string &  x,
const std::string &  y,
const std::string &  header1,
const std::string &  header2,
const std::string &  outdir,
ScoreMode  mode,
ScoreFn  score_fn 
)

Perform local sequence alignment using the Smith-Waterman algorithm.

This function computes the local alignment of two sequences using affine gap scoring. It initializes the DP matrix, fills it row by row, and performs traceback to get the aligned sequences.

Parameters
x: The first sequence (string).
y: The second sequence (string).
header1: Header for the first sequence.
header2: Header for the second sequence.
outdir: Output directory for results.
mode: Scoring mode (MODE_DNA or MODE_PROTEIN).
score_fn: Scoring function for match/mismatch.

◆ main()

int main ( int  argc,
char **  argv 
)

Main function to run the MPI-based sequence alignment tool.

This function initializes MPI, parses command line arguments, reads input files, and performs sequence alignments based on user choices.

Parameters
argc: Number of command line arguments.
argv: Array of command line arguments.
Returns
Exit status code.

◆ printColoredAlignment()

void printColoredAlignment ( const std::string &  seq1_aln,
const std::string &  seq2_aln,
std::ostream &  os = std::cout 
)

Print two aligned sequences with color coding.

Matches are printed in green, mismatches in cyan, and gaps in red. The alignment is printed in blocks of LINE_WIDTH characters.

Parameters
seq1_aln: Aligned sequence 1 (with gaps).
seq2_aln: Aligned sequence 2 (with gaps).
os: Output stream (default is std::cout).

◆ processFasta()

void processFasta ( const string &  filename,
string &  header,
string &  sequence 
)

Read the first record from a FASTA file.

Extracts the very first header (minus the leading >), and concatenates all subsequent lines into a single sequence string.

Parameters
filename: Path to the FASTA file.
[out] header: On return, the header line without leading >.
[out] sequence: On return, the full sequence (no newlines).
Exceptions
std::runtime_error: if the file cannot be opened.

◆ saveLCS()

void saveLCS ( const std::string &  id,
const std::string &  lcs_str,
std::ostream &  os 
)

Write the LCS (Longest Common Subsequence) in FASTA format.

Parameters
id: Identifier for the LCS (no leading '>').
lcs_str: The LCS string.
os: Output stream (e.g. ofstream).

◆ savePlainAlignment()

void savePlainAlignment ( const std::string &  header1,
const std::string &  header2,
const std::string &  aligned1,
const std::string &  aligned2,
std::ostream &  os 
)

Write two aligned sequences in FASTA format.

Parameters
header1: Identifier for sequence 1 (no leading '>').
header2: Identifier for sequence 2 (no leading '>').
aligned1: Aligned sequence 1 (with gaps).
aligned2: Aligned sequence 2 (with gaps).
os: Output stream (e.g. ofstream).

◆ score()

int score ( char  x,
char  y,
ScoreMode  mode 
)
inline

Look up the score between two characters based on the selected mode.

Parameters
x: First character (base or amino acid).
y: Second character (base or amino acid).
mode: Scoring mode (MODE_DNA or MODE_PROTEIN).
Returns
Score based on the selected scoring matrix.

◆ showProgressBar()

void showProgressBar ( int  progress,
int  total 
)

Show a progress bar in the console.

Parameters
progress: Current progress (0 to total).
total: Total number of steps.

◆ writeCharMatrix()

void writeCharMatrix ( const std::vector< std::vector< char > > &  mat,
const std::string &  filename 
)

Write a character matrix to a binary file.

Parameters
mat: The character matrix (2D vector).
filename: The output filename.

◆ writeDPMatrix()

void writeDPMatrix ( const std::vector< std::vector< int > > &  dp,
const std::string &  filename 
)

Write the DP matrix to a binary file.

Parameters
dp: The DP matrix (2D vector).
filename: The output filename.

◆ writeRawCharMatrix()

void writeRawCharMatrix ( const std::vector< std::vector< char > > &  mat,
const std::string &  filename 
)

Write a character matrix to a file.

Parameters
mat: The character matrix (2D vector).
filename: The output filename.

◆ writeRawDPMatrix()

void writeRawDPMatrix ( const std::vector< std::vector< int > > &  dp,
const std::string &  filename 
)

Write the DP matrix to a file.

Parameters
dp: The DP matrix (2D vector).
filename: The output filename.

Variable Documentation

◆ binary

bool binary = false

◆ GAP_EXTEND

double GAP_EXTEND = -1.0

◆ GAP_OPEN

double GAP_OPEN = -5.0

Gap-opening penalty.

◆ txt

bool txt = false

◆ verbose

bool verbose = false