BLAST MATLAB

Da Bioingegneria Elettronica e Informatica.

Nicola Altini nicola.altini@poliba.it

Vitoantonio Bevilacqua vitoantonio.bevilacqua@poliba.it

Slides

BLAST Examples with MATLAB

Download .fna and .faa files

  1. % The following examples assume you have a FASTA nucleotide file and a FASTA amino acid file for E. coli:
  2. % NC_004431.fna and NC_004431.faa, saved inside path_fasta_files.
  3.  
  4. path_fasta_files = "MATLAB/BLAST/fasta_files";
  5.  
  6. fullURL = 'http://www.scfbio-iitd.res.in/chemgenome/genomedataset/NC_004431.fna';
  7. filename_fna = fullfile(path_fasta_files, 'NC_004431.fna');
  8. urlwrite(fullURL,filename_fna);
  9.  
  10. fullURL = 'https://www.brinkman.mbb.sfu.ca/~mlangill/public_tmp/NC_004431.faa';
  11. filename_faa = fullfile(path_fasta_files, 'NC_004431.faa');
  12. urlwrite(fullURL,filename_faa);

Prepare database and query

  1. % Create local blastable databases from the NC_004431.fna and NC_004431.faa FASTA files by using the blastformat function.
  2. blastformat('inputdb', filename_fna, 'protein', 'false', 'log', 'formatdb.log');
  3. blastformat('inputdb', filename_faa, 'log', 'formatdb.log');
  4.  
  5. % Use the getgenbank function to retrieve sequence information for the
  6. % E. coli threonine operon from the GenBank database.
  7. S = getgenbank('M28570');
  8.  
  9. % Create a query file by using the fastawrite function to create a
  10. % FASTA file named query_nt.fa from this sequence information, using
  11. % only the accession number as the header.
  12. S.Header = S.Accession;
  13. path_query = fullfile(path_fasta_files, 'query_nt.fa');
  14. fastawrite(path_query, S);

Examples

Example 1. Search nucleotide query versus nucleotide data

  1. results = blastlocal('inputquery', path_query,...
  2.                      'database', filename_fna,...
  3.                      'program',  'blastn');
  4.  
  5. print_score_results(results(1));

Example 2. Performing a Nucleotide Translated Search

  1. % Use MATLAB syntax to submit the query sequence in the query_nt.fa FASTA file for a BLAST search 
  2. % of the local amino acid database NC_004431.faa. 
  3. % Specify the BLAST program blastx. Return the BLAST search results in results, a MATLAB structure.
  4.  
  5. results = blastlocal('inputquery', path_query,...
  6.                      'database', filename_faa,...
  7.                      'program',  'blastx');

Example 3. Performing a Nucleotide Search Using blastall Syntax

  1. % Use blastall syntax to submit the query sequence in the query_nt.fa FASTA file for a BLAST search
  2. % of the local nucleotide database NC_004431.fna. 
  3. % Specify the BLAST program blastn and an expectation value of 0.0001. 
  4. % Return the BLAST search results in results, a MATLAB structure.
  5.  
  6. query = sprintf('-i %s -d %s -p blastn -e 0.0001', path_query, filename_fna);
  7. results = blastlocal(query);

Example 4. Performing a Nucleotide Search and Creating a Formatted Report

  1. % Submit the query sequence in the query_nt.fa FASTA file for a BLAST search 
  2. % of the local nucleotide database NC_004431.fna. 
  3. % Specify the BLAST program blastn and a tabular alignment format.
  4. % Save the contents of the BLAST report to a file named myecoli_nt.txt.
  5.  
  6. blast_report_file = fullfile(path_fasta_files, 'myecoli_nt.txt');
  7. blastlocal('inputquery', path_query,...
  8.            'database', filename_fna, ...
  9.            'tofile', blast_report_file, ...
  10.            'blastargs', '-p blastn -m 8');

Utility Functions

  1. function pen = score_from_matrix(char1, char2, varargin)
  2.     if nargin < 3
  3.         pen_matrix = ...
  4.              [+1  -5  -5  -1;
  5.               -5  +1  -1  -5;
  6.               -5  -1  +1  -5;
  7.               -1  -5  -5  +1];
  8.     else
  9.         pen_matrix = varargin{1};
  10.     end
  11.  
  12.     % ATCG
  13.     idx1 = idx_from_atcg(char1);
  14.     idx2 = idx_from_atcg(char2);
  15.  
  16.     if idx1 > 0 && idx2 > 0
  17.         pen = pen_matrix(idx1, idx2);
  18.     else
  19.         pen = 0;
  20.     end
  21. end
  22.  
  23. function idx = idx_from_atcg(char1)
  24.     if char1 == 'a'
  25.         idx = 1;
  26.     elseif char1 == 't'
  27.         idx = 2;
  28.     elseif char1 == 'c'
  29.         idx = 3;
  30.     elseif char1 == 'g'
  31.         idx = 4;
  32.     else
  33.         idx = -1;
  34.     end
  35. end
  36.  
  37. function print_score_results(results)
  38.     n_hits = numel(results.Hits.HSPs);
  39.  
  40.     for idx_hit = 1:n_hits
  41.         fprintf('Index HIT = %d\n', idx_hit);
  42.         alignment = results.Hits.HSPs(idx_hit).Alignment;
  43.         rew = results.Hits.HSPs(idx_hit).Identities.Match * 2;
  44.  
  45.         empty_pos = [];
  46.         ncols = size(alignment,2);
  47.         for i = 1:ncols
  48.             if alignment(2,i) == ' '
  49.                 empty_pos = [empty_pos, i];
  50.             end
  51.         end
  52.  
  53.         pen = 0;
  54.         for i = 1:numel(empty_pos)
  55.             empty_pos_idx = empty_pos(i);
  56.             char1 = alignment(1,empty_pos_idx);
  57.             char2 = alignment(3,empty_pos_idx);
  58.             fprintf('char1 = %c char2 = %c\n', char1, char2);
  59.             pen_iter = score_from_matrix(char1, char2);
  60.             pen = pen + pen_iter;
  61.         end
  62.         fprintf('Penalty is %d\n', pen);
  63.         score = rew + pen;
  64.         fprintf('Final score is %d\n', score);
  65.         fprintf('\n');
  66.     end
  67. end