Genetic Algorithms for MSA (MATLAB)

Da Bioingegneria Elettronica e Informatica.

Nicola Altini nicola.altini@poliba.it

Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it

Slides

Genetic Algorithms for Multiple Sequence Alignment

Class msaga

  1. classdef msaga
  2.     %MSAGA Summary of this class goes here
  3.     %   Detailed explanation goes here
  4.  
  5.     properties
  6.         chromosomes
  7.         generations
  8.         min_generations
  9.         mutation_rate
  10.         crossover_prob
  11.     end
  12.  
  13.     methods
  14.         function obj = msaga(chromosomes, generations, min_generations, mutation_rate, crossover_prob)
  15.             %MSAGA Construct an instance of this class
  16.             %   Detailed explanation goes here
  17.             obj.chromosomes = chromosomes;
  18.             obj.generations = generations;
  19.             obj.min_generations = min_generations;
  20.             obj.mutation_rate = mutation_rate;
  21.             obj.crossover_prob = crossover_prob;
  22.         end
  23.  
  24.         function [pop, best_chromosomes, best_values] = run_ga(obj, input_path)
  25.             % Read the input file sequences
  26.             [seq_table] = prepare_input(input_path);
  27.  
  28.             % Show the original chromosome
  29.             fprintf("Input Sequence Table:\n");
  30.             disp(seq_table);
  31.  
  32.             % Initialize the population
  33.             pop = obj.initialize_pop(seq_table);       
  34.  
  35.             % Repeat for all generations or until a good solution is found
  36.             best_val = false;
  37.             best_chromosome = false;
  38.             best_chromosomes = {};
  39.             best_values = [];
  40.             gener_count = 1;
  41.             new_pop = {};
  42.  
  43.             while gener_count < obj.generations
  44.  
  45.                 % Evaluate all chromosomes
  46.                 evaluations = {};
  47.                 for i = 1:size(pop,1)
  48.                     pop{i,1}.evaluation = msaga.eval_function(pop{i,1}.chromosome);
  49.                     evaluations{end+1,1} = pop{i,1}.evaluation;
  50.                 end
  51.  
  52.                 % Repeat for all chromosomes
  53.                 for chromosome_counter = 1:obj.chromosomes
  54.                     % Select two parents
  55.                     [p1, p2] = msaga.select_parents(pop, evaluations);
  56.                     fprintf("\nParent 1:\n");
  57.                     disp(p1.chromosome);
  58.                     fprintf("\nParent 2:\n");
  59.                     disp(p2.chromosome);
  60.  
  61.                     % Get crossover probability
  62.                     crossover_prob = rand();
  63.  
  64.                     % Apply a crossover operation on p1 and p2
  65.                     if crossover_prob < obj.crossover_prob
  66.                         child = msaga.apply_crossover(pop, p1, p2);
  67.  
  68.                         % Apply a mutation on child
  69.                         child.chromosome = obj.apply_mutation(child.chromosome);
  70.  
  71.                         % Add the child to the new population
  72.                         child.chromosome = add_gaps(child.chromosome);
  73.                         child.chromosome = remove_useless_gaps(child.chromosome);
  74.                         new_pop{end+1,1} = child;
  75.  
  76.                         fprintf("\nChild:\n");
  77.                         disp(child.chromosome);
  78.                     end
  79.                 end
  80.  
  81.                 % Get the best chromosome
  82.                 best_val = 0;
  83.                 best_chromosome = false;
  84.                 for i = 1:size(new_pop,1)
  85.                     curr_val = msaga.eval_function(new_pop{i,1}.chromosome);
  86.                     new_pop{i,1}.evaluation = curr_val;
  87.  
  88.                     if curr_val >= best_val
  89.                         best_val = curr_val;
  90.                         best_chromosome = new_pop{i,1}.chromosome;
  91.                     end
  92.                 end
  93.  
  94.                 % Print stats
  95.                 fprintf("Generation num %d: best val %d\n", gener_count, best_val);
  96.  
  97.                 % Add best chromosome to list
  98.                 best_chromosomes{end+1,1} = best_chromosome;
  99.                 best_values(end+1,1) = best_val;
  100.  
  101.                 % Break the execution if there are no relevant changes
  102.                 if obj.check_no_changes(best_values)
  103.                     fprintf("Stop the optimization. No progresses!\n");
  104.                     break;
  105.                 end
  106.  
  107.                 % Update population
  108.                 % Add to the population the new generated chromosomes and
  109.                 % remove the same number of the worst chromosomes from
  110.                 % the original population
  111.                 pop_evaluations = [];
  112.                 for i = 1:size(pop,1)
  113.                     pop_evaluations = [pop_evaluations, pop{i,1}.evaluation];
  114.                 end
  115.                 [sorted_pop, pop_indices] = sort(pop_evaluations, 'descend');
  116.                 new_pop_evaluations = []; 
  117.                 for i = 1:size(new_pop,1)
  118.                     new_pop_evaluations = [new_pop_evaluations, new_pop{i,1}.evaluation];
  119.                 end
  120.                 [sorted_new_pop, new_pop_indices] = sort(new_pop_evaluations, 'descend');
  121.  
  122.                 for i = 1:size(new_pop,1)
  123.                     index_pop = pop_indices( numel(pop_indices)-numel(new_pop_indices)+i );
  124.                     index_new_pop = new_pop_indices (i);
  125.                     pop{index_pop,1} = new_pop{index_new_pop,1}; 
  126.                 end
  127.  
  128.                 new_pop = {};
  129.                 gener_count = gener_count + 1;
  130.             end
  131.         end
  132.  
  133.         function [change] = check_no_changes(obj, best_values)
  134.             num_best_chromosomes = size(best_values, 1);
  135.             if num_best_chromosomes < obj.min_generations
  136.                 change = false;
  137.                 return;
  138.             else
  139.                 percent = round(0.2 * num_best_chromosomes);
  140.                 if percent < 2
  141.                     percent = 2;
  142.                 end
  143.                 last_best_values = best_values(end-percent:end,1);
  144.                 var_lbv = var(last_best_values);
  145.                 if var_lbv < 1.05
  146.                     change = true;
  147.                     return;
  148.                 end
  149.             end
  150.             change = false;
  151.         end    
  152.  
  153.         function [child] = apply_mutation(obj, child)
  154.             n = size(child,1);
  155.  
  156.  
  157.             % Select mutation method and apply
  158.             if rand() < obj.mutation_rate
  159.                 fprintf("\nChild before mutation:\n");
  160.                 disp(child);
  161.  
  162.                 rand_value = rand();
  163.  
  164.                 % Apply gaps removal
  165.                 if rand_value < 0.5
  166.                     cell_i = randi([1,n]);
  167.                     cell_j = randi([1,strlength(child{cell_i,1})]);
  168.  
  169.                     child_i = char(child{cell_i,1});
  170.                     child_i_j = child_i(cell_j);
  171.                     while child_i_j ~= "-"
  172.                         cell_i = randi([1,n]);
  173.                         cell_j = randi([1,strlength(child{cell_i,1})]);
  174.                         child_i = char(child{cell_i,1});
  175.                         child_i_j = child_i(cell_j);
  176.                     end
  177.  
  178.                     % Get extending gap
  179.                     gaps = get_interval_gaps(child, cell_i, cell_j);
  180.                     start_gap = gaps{1,1};
  181.                     end_gap = gaps{end,1};
  182.                     if start_gap ~= end_gap
  183.                         fprintf("Start gap = %d\tEnd gap = %d\n", start_gap,end_gap);                   
  184.                     end
  185.  
  186.                     % Remove gaps
  187.                     start_end_child = char(child{cell_i,1});
  188.                     start_child = start_end_child(1:start_gap);           
  189.                     end_child = start_end_child(end_gap+1:end);
  190.                     child{cell_i,1} = string(start_child) + string(end_child);                 
  191.  
  192.                 % Apply k condition (add gaps)
  193.                 else
  194.                     cell_i = randi([1,n]);
  195.                     cell_j = randi([1,strlength(child{cell_i,1})]);
  196.                     k = randi(1, ceil(0.1 * calc_max_length(child)));
  197.                     to_add = "";
  198.  
  199.                     for i = 1:k
  200.                         to_add = to_add + "-";
  201.                     end
  202.  
  203.                     % child[cell_i] = child[cell_i][:cell_j] + to_add + child[cell_i][cell_j:]
  204.                     start_end_child = char(child{cell_i,1});
  205.                     start_child = start_end_child(1:cell_j);           
  206.                     end_child = start_end_child(cell_j+1:end);
  207.                     child{cell_i} = string(start_child) + to_add + string(end_child);
  208.  
  209.                 end
  210.  
  211.                 fprintf("\nChild after mutation:\n");
  212.                 disp(child);
  213.             end
  214.  
  215.         end
  216.  
  217.         function [pop] = initialize_pop(obj, seq_table)
  218.             pop = {};
  219.             for chromosome_counter = 1:obj.chromosomes
  220.  
  221.                 % Create new chromosome
  222.                 new_chromosome = {};
  223.  
  224.                 % Use nwalign to compute pairwise alignments
  225.                 % by the Needleman-Wunsch algorithm
  226.                 for i = 1:size(seq_table,1)
  227.                     alignments = {};
  228.  
  229.                     % Compute pairwise alignments
  230.                     for j = 1:size(seq_table,1)
  231.                         if i ~= j
  232.                             [current_score, current_align] = nwalign(seq_table{i,1}{1}, seq_table{j,1}{1});
  233.                             alignments{end+1,1} = string(current_align);
  234.                         end
  235.                     end
  236.  
  237.                     % Randomly select an alignment
  238.                     alignment = randsample(alignments, 1);
  239.                     alignment = alignment{1}(1,:);
  240.                     new_chromosome{end+1,1} = alignment;
  241.                 end
  242.  
  243.                 % Add the chromosome generated and prints it
  244.                 new_chromosome = add_gaps(new_chromosome);
  245.                 pop_iter.chromosome = new_chromosome;
  246.                 pop_iter.evaluation = 0;
  247.                 pop{end+1,1} = pop_iter;
  248.                 fprintf("\nChromosome %3d on %3d:\n", chromosome_counter, obj.chromosomes);
  249.                 disp(new_chromosome);
  250.             end
  251.         end
  252.  
  253.     end
  254.  
  255.     methods(Static)
  256.         function [sum_score] = eval_function(chromosome)
  257.             sum_score = 0;
  258.             for i = 2:size(chromosome,1)                                
  259.                 % Compute pairwise scores
  260.                 for j = 1:(i-1)
  261.                     [curr_score, ~] = nwalign(chromosome{i,1}, chromosome{j,1}, ...
  262.                                               'ScoringMatrix', 'BLOSUM62', ...
  263.                                               'GapOpen', 1, ...
  264.                                               'ExtendGap', 0.5);
  265.                     sum_score = sum_score + curr_score;
  266.                 end
  267.             end
  268.         end
  269.  
  270.         function [p1, p2] = select_parents(pop, evaluations)
  271.             % Normalize the fitness
  272.             pop_sum = sum(cell2mat(evaluations));
  273.             evaluations_aux = {};
  274.             for i = 1:size(evaluations,1)
  275.                 evaluations_aux{i,1} = evaluations{i,1} / pop_sum;
  276.             end
  277.  
  278.             % Build the mating pool
  279.             pool = {};
  280.             for i = 1:size(pop,1)
  281.                 prob = ceil(evaluations_aux{i,1} * 100);
  282.                 for j = 1:prob
  283.                     pool{end+1,1} = pop{i,1};
  284.                 end
  285.             end
  286.  
  287.             % Randomly select two parents
  288.             p1 = randsample(pool,1);
  289.             p1 = p1{1};
  290.             p2 = randsample(pool,1);
  291.             p2 = p2{1};
  292.         end
  293.  
  294.         function [child_struct] = apply_crossover(pop, p1, p2)
  295.             n = numel(pop{1,1}.chromosome);
  296.  
  297.             % Apply horizontal crossover
  298.             rand_horizontal = randi([1,n-1]);
  299.             child = {};
  300.             for i = 1:n
  301.                 if i <= rand_horizontal
  302.                     child{i,1} = p1.chromosome{i,1};
  303.                 else 
  304.                     child{i,1} = p2.chromosome{i,1};
  305.                 end
  306.             end
  307.             child_struct.chromosome = child;
  308.             child_struct.evaluation = 0;
  309.         end
  310.     end    
  311. end

Helper functions

Add Gaps

  1. function [sequences_table] = add_gaps(sequences_table)
  2. num_sequences = size(sequences_table, 1);
  3. max_length = calc_max_length(sequences_table);
  4. for i = 1:num_sequences
  5.     sequence = sequences_table{i,1}{1};
  6.     length = strlength(sequence);
  7.     if length < max_length
  8.         diff_length = max_length - length;
  9.         for j = 1:diff_length
  10.             sequence = sequence + "-";    
  11.         end
  12.         if istable(sequences_table)
  13.             sequences_table{i,1} = {sequence};
  14.         else
  15.             sequences_table{i,1} = sequence;
  16.         end
  17.     end
  18. end
  19. end

Calc Max Length

  1. function [max_length] = calc_max_length(sequences_table)
  2. num_sequences = size(sequences_table, 1);
  3. max_length = 0;
  4. for i = 1:num_sequences
  5.     length = strlength(sequences_table{i,1}{1});
  6.     if length > max_length
  7.         max_length = length;
  8.     end
  9. end
  10. end

Get Interval Gaps

  1. function [gaps] = get_interval_gaps(lines, cell_i, cell_j)
  2. aux_cell_j = cell_j;
  3. gaps = {};
  4. symbol_found = false;
  5.  
  6. % Find gaps after cell_j (inclusive)
  7. while ~symbol_found
  8.     if cell_j > strlength(lines{cell_i,1})
  9.         break;
  10.     else
  11.         lines_i = char(lines{cell_i,1});
  12.         lines_i_j = lines_i(cell_j);
  13.         if lines_i_j == "-"
  14.             gaps{end+1,1} = cell_j;
  15.             cell_j = cell_j + 1;
  16.         else 
  17.             symbol_found = true;
  18.         end 
  19.     end
  20. end
  21.  
  22. % Find gaps before cell_j (exclusive)
  23. aux_cell_j = aux_cell_j - 1;
  24. while ~symbol_found
  25.     if aux_cell_j < 1
  26.         break;
  27.     else
  28.         lines_i = char(lines{cell_i,1});
  29.         lines_i_j = lines_i(aux_cell_j);
  30.         if lines_i_j == "-"
  31.             gaps = [ { aux_cell_j }; gaps ];
  32.             aux_cell_j = aux_cell_j - 1;
  33.         else
  34.             symbol_found = true;
  35.         end        
  36.     end
  37. end
  38.  
  39. end

Prepare Input

  1. function [seq_table] = prepare_input(input_path)
  2. seq_table = readtable(input_path, 'ReadVariableNames', 0);
  3. seq_table.Properties.VariableNames = "Sequences";
  4. num_sequences = size(seq_table, 1);
  5. for i = 1:num_sequences
  6.     seq_table{i,1}{1} = string(seq_table{i,1}{1});
  7. end
  8. [seq_table] = add_gaps(seq_table);
  9. end

Remove Useless Gaps

  1. function [sequences_table] = remove_useless_gaps(sequences_table)
  2.     to_rm = {};
  3.     only_gaps = true;
  4.     max_length = calc_max_length(sequences_table);
  5.  
  6.     % Find columns with gaps only
  7.     for i = 1:max_length
  8.         for j = 1:size(sequences_table,1)
  9.             chromosome_j = char(sequences_table{j,1});
  10.             chromosome_j_i = chromosome_j(i);
  11.             if chromosome_j_i ~= " " && chromosome_j_i ~= "-"
  12.                 only_gaps = false;
  13.             end
  14.         end
  15.  
  16.         if only_gaps
  17.             to_rm{end+1,1} = i;
  18.         else
  19.             only_gaps = true;
  20.         end
  21.     end
  22.  
  23.     % Remove gap-only columns
  24.     to_rm = flip(to_rm);
  25.     for i = 1:size(sequences_table,1)
  26.         line = [];
  27.         for j = 1:max_length
  28.             chromosome_i = char(sequences_table{i,1});
  29.             chromosome_i_j = chromosome_i(j);
  30.             line = [line, chromosome_i_j];
  31.         end
  32.  
  33.         for k = 1:numel(to_rm)
  34.             to_rm_el = to_rm{k,1};
  35.             line(to_rm_el) = [];
  36.         end
  37.  
  38.         sequences_table{i,1} = string(line);
  39.     end
  40. end